From dc9711f4121e81cd4ae4dc94f5dfd3fe46c8d94b Mon Sep 17 00:00:00 2001 From: Mr-Neutr0n <64578610+Mr-Neutr0n@users.noreply.github.com> Date: Tue, 10 Feb 2026 00:11:03 +0530 Subject: [PATCH] Add safety bounds to dynamic_preprocess to prevent OOM with extreme aspect ratios Images with extreme aspect ratios (e.g., very wide panoramas or tall screenshots) can cause excessive memory allocation in dynamic_preprocess because the function generates patch grid ratios without any aspect ratio filtering. This can lead to OOM errors, especially when max_num is set to high values. Changes: - Cap max_num at MAX_PATCHES_LIMIT (24) to prevent runaway patch counts - Filter out target ratios where either dimension ratio exceeds MAX_ASPECT_RATIO_THRESHOLD (200) to avoid degenerate patch grids (with max_num capped at 24 the most extreme grid is 24:1, so at these values the threshold does not remove any ratio) - Add a safety fallback to (1,1) if no candidate ratios remain (e.g., when min_num exceeds the capped max_num) - Apply the fix consistently across all three copies of the function: internvl_chat, internvl_chat_gpt_oss, and streamlit_demo --- internvl_chat/internvl/train/dataset.py | 22 +++++++++++++++++++ .../internvl/train/dataset.py | 22 +++++++++++++++++++ streamlit_demo/model_worker.py | 22 +++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/internvl_chat/internvl/train/dataset.py b/internvl_chat/internvl/train/dataset.py index cffdcf7ac..340a7b357 100644 --- a/internvl_chat/internvl/train/dataset.py +++ b/internvl_chat/internvl/train/dataset.py @@ -827,16 +827,38 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_ return best_ratio +# Maximum number of patches allowed to prevent OOM with large max_num values. +MAX_PATCHES_LIMIT = 24 +# Maximum aspect ratio (width/height or height/width) allowed for target +# patch grids. Ratios beyond this threshold are filtered out to avoid +# excessive memory allocation when processing images with extreme proportions. 
+MAX_ASPECT_RATIO_THRESHOLD = 200 + + def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False): orig_width, orig_height = image.size aspect_ratio = orig_width / orig_height + # Enforce an upper bound on max_num to prevent OOM from runaway patch counts + max_num = min(max_num, MAX_PATCHES_LIMIT) + # calculate the existing image aspect ratio target_ratios = set( (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + # Filter out target ratios with extreme aspect ratios to prevent OOM. + # NOTE: a grid like (12,1), a 5376x448 intermediate image, is NOT removed: + # max_num is capped at 24, so no grid ratio can exceed the threshold of 200. + target_ratios = [r for r in target_ratios + if r[0] / r[1] <= MAX_ASPECT_RATIO_THRESHOLD + and r[1] / r[0] <= MAX_ASPECT_RATIO_THRESHOLD] + + # Safety fallback: if all ratios were filtered, use a 1:1 grid + if not target_ratios: + target_ratios = [(1, 1)] + # find the closest aspect ratio to the target target_aspect_ratio = find_closest_aspect_ratio( aspect_ratio, target_ratios, orig_width, orig_height, image_size) diff --git a/internvl_chat_gpt_oss/internvl/train/dataset.py b/internvl_chat_gpt_oss/internvl/train/dataset.py index d13fec012..033f6d7d4 100644 --- a/internvl_chat_gpt_oss/internvl/train/dataset.py +++ b/internvl_chat_gpt_oss/internvl/train/dataset.py @@ -726,16 +726,38 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_ return best_ratio +# Maximum number of patches allowed to prevent OOM with large max_num values. +MAX_PATCHES_LIMIT = 24 +# Maximum aspect ratio (width/height or height/width) allowed for target +# patch grids. Ratios beyond this threshold are filtered out to avoid +# excessive memory allocation when processing images with extreme proportions. 
+MAX_ASPECT_RATIO_THRESHOLD = 200 + + def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False): orig_width, orig_height = image.size aspect_ratio = orig_width / orig_height + # Enforce an upper bound on max_num to prevent OOM from runaway patch counts + max_num = min(max_num, MAX_PATCHES_LIMIT) + # calculate the existing image aspect ratio target_ratios = set( (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + # Filter out target ratios with extreme aspect ratios to prevent OOM. + # NOTE: a grid like (12,1), a 5376x448 intermediate image, is NOT removed: + # max_num is capped at 24, so no grid ratio can exceed the threshold of 200. + target_ratios = [r for r in target_ratios + if r[0] / r[1] <= MAX_ASPECT_RATIO_THRESHOLD + and r[1] / r[0] <= MAX_ASPECT_RATIO_THRESHOLD] + + # Safety fallback: if all ratios were filtered, use a 1:1 grid + if not target_ratios: + target_ratios = [(1, 1)] + # find the closest aspect ratio to the target target_aspect_ratio = find_closest_aspect_ratio( aspect_ratio, target_ratios, orig_width, orig_height, image_size) diff --git a/streamlit_demo/model_worker.py b/streamlit_demo/model_worker.py index aa6f3aa7b..aff902d38 100644 --- a/streamlit_demo/model_worker.py +++ b/streamlit_demo/model_worker.py @@ -68,16 +68,38 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_ return best_ratio +# Maximum number of patches allowed to prevent OOM with large max_num values. +MAX_PATCHES_LIMIT = 24 +# Maximum aspect ratio (width/height or height/width) allowed for target +# patch grids. Ratios beyond this threshold are filtered out to avoid +# excessive memory allocation when processing images with extreme proportions. 
+MAX_ASPECT_RATIO_THRESHOLD = 200 + + def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False): orig_width, orig_height = image.size aspect_ratio = orig_width / orig_height + # Enforce an upper bound on max_num to prevent OOM from runaway patch counts + max_num = min(max_num, MAX_PATCHES_LIMIT) + # calculate the existing image aspect ratio target_ratios = set( (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + # Filter out target ratios with extreme aspect ratios to prevent OOM. + # NOTE: a grid like (12,1), a 5376x448 intermediate image, is NOT removed: + # max_num is capped at 24, so no grid ratio can exceed the threshold of 200. + target_ratios = [r for r in target_ratios + if r[0] / r[1] <= MAX_ASPECT_RATIO_THRESHOLD + and r[1] / r[0] <= MAX_ASPECT_RATIO_THRESHOLD] + + # Safety fallback: if all ratios were filtered, use a 1:1 grid + if not target_ratios: + target_ratios = [(1, 1)] + # find the closest aspect ratio to the target target_aspect_ratio = find_closest_aspect_ratio( aspect_ratio, target_ratios, orig_width, orig_height, image_size)