From e8ed31d33563996e6a42bb83d1b384733ffbe56c Mon Sep 17 00:00:00 2001 From: eust-w Date: Tue, 23 Sep 2025 11:12:14 +0800 Subject: [PATCH] :bug: fix text-only input bug --- ops_mm_embedding_v1.py | 41 +++++++++++++++++++++++++++++------------ run_server.sh | 3 +++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/ops_mm_embedding_v1.py b/ops_mm_embedding_v1.py index c5fc930..68dc2bd 100644 --- a/ops_mm_embedding_v1.py +++ b/ops_mm_embedding_v1.py @@ -22,6 +22,8 @@ class OpsMMEmbeddingV1(nn.Module): load_in_4bit: bool = False, load_in_8bit: bool = False, torch_dtype: Optional[torch.dtype] = torch.bfloat16, + processor_min_pixels: int = 128 * 28 * 28, + processor_max_pixels: int = 512 * 28 * 28, ): super().__init__() self.device = device @@ -49,7 +51,12 @@ class OpsMMEmbeddingV1(nn.Module): if device_map is None: self.base_model = self.base_model.to(self.device) - self.processor = AutoProcessor.from_pretrained(model_name, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28) + # Use configurable pixel limits to control VRAM usage + self.processor = AutoProcessor.from_pretrained( + model_name, + min_pixels=processor_min_pixels, + max_pixels=processor_max_pixels, + ) self.processor.tokenizer.padding_side = "left" self.eval() @@ -139,17 +146,27 @@ class OpsMMEmbeddingV1(nn.Module): input_texts.append(msg) input_images.append(processed_image) - # Only pass to processor if we actually have images - processed_images = input_images if any(img is not None for img in input_images) else None - - inputs = self.processor( - text=input_texts, - images=processed_images, - padding=True, - truncation=True, - max_length=self.max_length, - return_tensors="pt", - ) + # Only pass images when present; some processors expect paired inputs and + # can raise unpack errors if we pass images=None with a multi-modal processor.
+ has_images = any(img is not None for img in input_images) + if has_images: + processed_images = input_images + inputs = self.processor( + text=input_texts, + images=processed_images, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="pt", + ) + else: + inputs = self.processor( + text=input_texts, + padding=True, + truncation=True, + max_length=self.max_length, + return_tensors="pt", + ) inputs = {k: v.to(self.device) for k, v in inputs.items()} with torch.inference_mode(): diff --git a/run_server.sh b/run_server.sh index aa7c2a9..3651ea3 100644 --- a/run_server.sh +++ b/run_server.sh @@ -7,5 +7,8 @@ export CUDA_VISIBLE_DEVICES=0,1 # Unbuffered stdout for real-time logs export PYTHONUNBUFFERED=1 +# Help PyTorch allocator avoid fragmentation (see OOM hint) +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + # Start the local web app exec python3 web_app_local.py "$@"