From e8ed31d33563996e6a42bb83d1b384733ffbe56c Mon Sep 17 00:00:00 2001 From: eust-w Date: Tue, 23 Sep 2025 11:12:14 +0800 Subject: [PATCH] :bug: fix text-only input bug --- ops_mm_embedding_v1.py | 41 +++++++++++++++++++++++++++++------------ run_server.sh | 3 +++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/ops_mm_embedding_v1.py b/ops_mm_embedding_v1.py index c5fc930..68dc2bd 100644 --- a/ops_mm_embedding_v1.py +++ b/ops_mm_embedding_v1.py @@ -22,6 +22,8 @@ class OpsMMEmbeddingV1(nn.Module): load_in_4bit: bool = False, load_in_8bit: bool = False, torch_dtype: Optional[torch.dtype] = torch.bfloat16, + processor_min_pixels: int = 128 * 28 * 28, + processor_max_pixels: int = 512 * 28 * 28, ): super().__init__() self.device = device @@ -49,7 +51,12 @@ class OpsMMEmbeddingV1(nn.Module): if device_map is None: self.base_model = self.base_model.to(self.device) - self.processor = AutoProcessor.from_pretrained(model_name, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28) + # Use configurable pixel limits to control VRAM usage + self.processor = AutoProcessor.from_pretrained( + model_name, + min_pixels=processor_min_pixels, + max_pixels=processor_max_pixels, + ) self.processor.tokenizer.padding_side = "left" self.eval() @@ -139,17 +146,27 @@ class OpsMMEmbeddingV1(nn.Module): input_texts.append(msg) input_images.append(processed_image) - # Only pass to processor if we actually have images - processed_images = input_images if any(img is not None for img in input_images) else None - - inputs = self.processor( - text=input_texts, - images=processed_images, - padding=True, - truncation=True, - max_length=self.max_length, - return_tensors="pt", - ) + # Only pass images when present; some processors expect paired inputs and + # can raise unpack errors if we pass images=None with a multi-modal processor.
+ has_images = any(img is not None for img in input_images) + if has_images: + processed_images = input_images + inputs = self.processor( + text=input_texts, + images=processed_images, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="pt", + ) + else: + inputs = self.processor( + text=input_texts, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="pt", + ) inputs = {k: v.to(self.device) for k, v in inputs.items()} with torch.inference_mode(): diff --git a/run_server.sh b/run_server.sh index aa7c2a9..3651ea3 100644 --- a/run_server.sh +++ b/run_server.sh @@ -7,5 +7,8 @@ export CUDA_VISIBLE_DEVICES=0,1 # Unbuffered stdout for real-time logs export PYTHONUNBUFFERED=1 +# Help PyTorch allocator avoid fragmentation (see OOM hint) +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + # Start the local web app exec python3 web_app_local.py "$@"