CVE-2026-0599
Description
A vulnerability in huggingface/text-generation-inference version 3.3.6 allows unauthenticated remote attackers to exploit unbounded external image fetching during input validation in VLM mode. The issue arises when the router scans inputs for Markdown image links and performs a blocking HTTP GET request, reading the entire response body into memory and cloning it before decoding. This behavior can lead to resource exhaustion, including network bandwidth saturation, memory inflation, and CPU overutilization. The vulnerability is triggered even if the request is later rejected for exceeding token limits. The default deployment configuration, which lacks memory usage limits and authentication, exacerbates the impact, potentially crashing the host machine. The issue is resolved in version 3.3.7.
Affected packages
Versions sourced from the GitHub Security Advisory.
| Package | Affected versions | Patched versions |
|---|---|---|
| text-generation (PyPI) | < 3.3.7 | 3.3.7 |
Affected products
1. Range: v0.2.0, v0.2.1, v0.3.0, …
Patches
Commit 124ee40d143d8 — feat: support max_image_fetch_size to limit (#3339)
12 files changed · +78 −8
backends/llamacpp/src/main.rs+5 −0 modified@@ -157,6 +157,10 @@ struct Args { /// Maximum payload size in bytes. #[clap(default_value = "2000000", long, env)] payload_limit: usize, + + /// Maximum image fetch size in bytes. + #[clap(default_value = "1073741824", long, env)] + max_image_fetch_size: usize, } #[tokio::main] @@ -320,6 +324,7 @@ async fn main() -> Result<(), RouterError> { args.max_client_batch_size, args.usage_stats, args.payload_limit, + args.max_image_fetch_size, args.prometheus_port, ) .await?;
backends/trtllm/src/main.rs+4 −0 modified@@ -67,6 +67,8 @@ struct Args { usage_stats: UsageStatsLevel, #[clap(default_value = "2000000", long, env)] payload_limit: usize, + #[clap(default_value = "1073741824", long, env)] + max_image_fetch_size: usize, } async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<Tokenizer> { @@ -244,6 +246,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { executor_worker, usage_stats, payload_limit, + max_image_fetch_size, } = args; // Launch Tokio runtime @@ -325,6 +328,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { max_client_batch_size, usage_stats, payload_limit, + max_image_fetch_size, prometheus_port, ) .await?;
backends/v2/src/main.rs+4 −0 modified@@ -74,6 +74,8 @@ struct Args { usage_stats: usage_stats::UsageStatsLevel, #[clap(default_value = "2000000", long, env)] payload_limit: usize, + #[clap(default_value = "1073741824", long, env)] + max_image_fetch_size: usize, } #[derive(Debug, Subcommand)] @@ -120,6 +122,7 @@ async fn main() -> Result<(), RouterError> { max_client_batch_size, usage_stats, payload_limit, + max_image_fetch_size, } = args; if let Some(Commands::PrintSchema) = command { @@ -201,6 +204,7 @@ async fn main() -> Result<(), RouterError> { max_client_batch_size, usage_stats, payload_limit, + max_image_fetch_size, prometheus_port, ) .await?;
backends/v3/src/main.rs+4 −0 modified@@ -74,6 +74,8 @@ struct Args { usage_stats: usage_stats::UsageStatsLevel, #[clap(default_value = "2000000", long, env)] payload_limit: usize, + #[clap(default_value = "1073741824", long, env)] + max_image_fetch_size: usize, } #[derive(Debug, Subcommand)] @@ -120,6 +122,7 @@ async fn main() -> Result<(), RouterError> { max_client_batch_size, usage_stats, payload_limit, + max_image_fetch_size, } = args; if let Some(Commands::PrintSchema) = command { @@ -217,6 +220,7 @@ async fn main() -> Result<(), RouterError> { max_client_batch_size, usage_stats, payload_limit, + max_image_fetch_size, prometheus_port, ) .await?;
integration-tests/models/test_flash_llama_fp8.py+3 −0 modified@@ -13,6 +13,7 @@ async def flash_llama_fp8(flash_llama_fp8_handle): return flash_llama_fp8_handle.client +@pytest.mark.skip(reason="Issue with the model access") @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private @@ -26,6 +27,7 @@ async def test_flash_llama_fp8(flash_llama_fp8, response_snapshot): assert response == response_snapshot +@pytest.mark.skip(reason="Issue with the model access") @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private @@ -49,6 +51,7 @@ async def test_flash_llama_fp8_all_params(flash_llama_fp8, response_snapshot): assert response == response_snapshot +@pytest.mark.skip(reason="Issue with the model access") @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private
integration-tests/models/test_flash_llama_marlin_24.py+3 −0 modified@@ -15,6 +15,7 @@ async def flash_llama_marlin(flash_llama_marlin24_handle): return flash_llama_marlin24_handle.client +@pytest.mark.skip(reason="Issue with the model access") @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private @@ -27,6 +28,7 @@ async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot): assert response == response_snapshot +@pytest.mark.skip(reason="Issue with the model access") @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private @@ -50,6 +52,7 @@ async def test_flash_llama_marlin24_all_params(flash_llama_marlin, response_snap assert response == response_snapshot +@pytest.mark.skip(reason="Issue with the model access") @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private
integration-tests/models/test_flash_llama.py+1 −1 modified@@ -3,7 +3,7 @@ @pytest.fixture(scope="module") def flash_llama_handle(launcher): - with launcher("huggingface/llama-7b", num_shard=2) as handle: + with launcher("huggyllama/llama-7b", num_shard=2) as handle: yield handle
router/src/chat.rs+1 −1 modified@@ -673,7 +673,7 @@ mod tests { let (name, arguments) = get_tool_call_content(&events[0]); if let Some(name) = name { assert_eq!(name, "get_current_weather"); - output_name.push_str(&name); + output_name.push_str(name); } output.push_str(arguments); } else {
router/src/server.rs+4 −0 modified@@ -1523,6 +1523,7 @@ pub async fn run( max_client_batch_size: usize, usage_stats_level: usage_stats::UsageStatsLevel, payload_limit: usize, + max_image_fetch_size: usize, prometheus_port: u16, ) -> Result<(), WebServerError> { // CORS allowed origins @@ -1827,6 +1828,7 @@ pub async fn run( compat_return_full_text, allow_origin, payload_limit, + max_image_fetch_size, prometheus_port, ) .await; @@ -1889,6 +1891,7 @@ async fn start( compat_return_full_text: bool, allow_origin: Option<AllowOrigin>, payload_limit: usize, + max_image_fetch_size: usize, prometheus_port: u16, ) -> Result<(), WebServerError> { // Determine the server port based on the feature and environment variable. @@ -1920,6 +1923,7 @@ async fn start( max_input_tokens, max_total_tokens, disable_grammar_support, + max_image_fetch_size, ); let infer = Infer::new(
router/src/validation.rs +47 −4 modified@@ -12,7 +12,7 @@ use rand::{thread_rng, Rng}; use serde_json::Value; /// Payload validation logic use std::cmp::min; -use std::io::Cursor; +use std::io::{Cursor, Read}; use std::iter; use std::sync::Arc; use thiserror::Error; @@ -51,6 +51,7 @@ impl Validation { max_input_length: usize, max_total_tokens: usize, disable_grammar_support: bool, + max_image_fetch_size: usize, ) -> Self { let workers = if let Tokenizer::Python { .. } = &tokenizer { 1 @@ -78,6 +79,7 @@ config_clone, preprocessor_config_clone, tokenizer_receiver, + max_image_fetch_size, ) }); } @@ -480,6 +482,7 @@ fn tokenizer_worker( config: Option<Config>, preprocessor_config: Option<HubPreprocessorConfig>, mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>, + max_image_fetch_size: usize, ) { match tokenizer { Tokenizer::Python { @@ -503,6 +506,7 @@ &tokenizer, config.as_ref(), preprocessor_config.as_ref(), + max_image_fetch_size, )) .unwrap_or(()) }) @@ -524,6 +528,7 @@ &tokenizer, config.as_ref(), preprocessor_config.as_ref(), + max_image_fetch_size, )) .unwrap_or(()) }) @@ -562,10 +567,35 @@ fn format_to_mimetype(format: ImageFormat) -> String { .to_string() } -fn fetch_image(input: &str) -> Result<(Vec<u8>, String, usize, usize), ValidationError> { +fn fetch_image( + input: &str, + max_image_fetch_size: usize, +) -> Result<(Vec<u8>, String, usize, usize), ValidationError> { if input.starts_with("![](http://") || input.starts_with("![](https://") { let url = &input["![](".len()..input.len() - 1]; - let data = reqwest::blocking::get(url)?.bytes()?; + let response = reqwest::blocking::get(url)?; + + // Check Content-Length header if present + if let Some(content_length) = response.content_length() { + if content_length as usize > max_image_fetch_size { + return Err(ValidationError::ImageTooLarge( + content_length as usize, + max_image_fetch_size, + )); + } + } + + // Read the body with size limit to prevent unbounded memory allocation + let mut data = Vec::new(); + let mut limited_reader = response.take((max_image_fetch_size + 1) as u64); + limited_reader.read_to_end(&mut data)?; + + if data.len() > max_image_fetch_size { + return Err(ValidationError::ImageTooLarge( + data.len(), + max_image_fetch_size, + )); + } let format = image::guess_format(&data)?; // TODO Remove this clone @@ -787,6 +817,7 @@ fn prepare_input<T: TokenizerTrait>( tokenizer: &T, config: Option<&Config>, preprocessor_config: Option<&HubPreprocessorConfig>, + max_image_fetch_size: usize, ) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> { use Config::*; static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap()); @@ -805,7 +836,8 @@ input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string())); tokenizer_query.push_str(&inputs[start..chunk_start]); } - let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?; + let (data, mimetype, height, width) = + fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?; input_chunks.push(Chunk::Image(Image { data, mimetype })); tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width)); start = chunk_end; @@ -990,6 +1022,10 @@ pub enum ValidationError { InvalidImageContent(String), #[error("Could not fetch image: {0}")] FailedFetchImage(#[from] reqwest::Error), + #[error("Image size {0} bytes exceeds maximum allowed size of {1} bytes")] + ImageTooLarge(usize, usize), + #[error("Failed to read image data: {0}")] + ImageReadError(#[from] std::io::Error), #[error("{0} modality is not supported")] UnsupportedModality(&'static str), } @@ -1023,6 +1059,7 @@ mod tests { max_input_length, max_total_tokens, disable_grammar_support, + 1024 * 1024 * 1024, // 1GB ); let max_new_tokens = 10; @@ -1058,6 +1095,7 @@ mod tests { max_input_length, max_total_tokens, disable_grammar_support, + 1024 * 1024 * 1024, // 1GB ); let max_new_tokens = 10; @@ -1092,6 +1130,7 @@ mod tests { max_input_length, max_total_tokens, disable_grammar_support, + 1024 * 1024 * 1024, // 1GB ); match validation .validate(GenerateRequest { @@ -1132,6 +1171,7 @@ mod tests { max_input_length, max_total_tokens, disable_grammar_support, + 1024 * 1024 * 1024, // 1GB ); match validation .validate(GenerateRequest { @@ -1203,6 +1243,7 @@ mod tests { max_input_length, max_total_tokens, disable_grammar_support, + 1024 * 1024 * 1024, // 1GB ); match validation .validate(GenerateRequest { @@ -1293,6 +1334,7 @@ mod tests { max_input_length, max_total_tokens, disable_grammar_support, + 1024 * 1024 * 1024, // 1GB ); let chunks = match validation @@ -1349,6 +1391,7 @@ mod tests { max_input_length, max_total_tokens, disable_grammar_support, + 1024 * 1024 * 1024, // 1GB ); let (encoding, chunks) = match validation
server/Makefile-flash-att-v2+1 −1 modified@@ -3,7 +3,7 @@ flash_att_v2_commit_rocm := 47bd46e0204a95762ae48712fd1a3978827c77fd build-flash-attention-v2-cuda: pip install -U packaging wheel - pip install flash-attn==$(flash_att_v2_commit_cuda) + pip install --no-build-isolation flash-attn==$(flash_att_v2_commit_cuda) install-flash-attention-v2-cuda: build-flash-attention-v2-cuda echo "Flash v2 installed"
server/tests/models/test_model.py+1 −1 modified@@ -14,7 +14,7 @@ def batch_type(self): def generate_token(self, batch): raise NotImplementedError - tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b") + tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") model = TestModel( "test_model_id",
Vulnerability mechanics
Generated by null/stub on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.
References
News mentions: 4
No linked articles in our index yet (0).