prithivMLmods commited on
Commit
eb6d2ca
·
verified ·
1 Parent(s): 7a98807

update app

Browse files
Files changed (1) hide show
  1. app.py +126 -358
app.py CHANGED
@@ -1,20 +1,15 @@
1
  import os
2
  import gc
3
- import re
4
  import json
5
- import uuid
6
  import time
7
  import base64
8
- import random
9
  from io import BytesIO
10
  from threading import Thread
11
 
12
  import gradio as gr
13
  import spaces
14
  import torch
15
- import numpy as np
16
- from PIL import Image, ImageOps
17
- import cv2
18
 
19
  from transformers import (
20
  Qwen2_5_VLForConditionalGeneration,
@@ -30,16 +25,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
30
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
  print("Using device:", device)
32
 
33
-
34
- def load_model(model_id, cls, **kwargs):
35
- return cls.from_pretrained(
36
- model_id,
37
- trust_remote_code=True,
38
- torch_dtype=torch.float16,
39
- **kwargs
40
- ).to(device).eval()
41
-
42
-
43
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
44
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
45
  model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -96,20 +81,13 @@ MODEL_MAP = {
96
  MODEL_CHOICES = list(MODEL_MAP.keys())
97
 
98
  image_examples = [
99
- {"query": "type out the messy hand-writing as accurately as you can.", "media": "images/1.jpg", "model": "coreOCR-7B-050325-preview", "mode": "image"},
100
- {"query": "count the number of birds and explain the scene in detail.", "media": "images/2.jpeg", "model": "DeepCaption-VLA-7B", "mode": "image"},
101
- {"query": "how far is the Goal from the penalty taker in this image?.", "media": "images/3.png", "model": "SpaceThinker-3B", "mode": "image"},
102
- {"query": "approximately how many meters apart are the chair and bookshelf?.", "media": "images/4.png", "model": "SkyCaptioner-V1", "mode": "image"},
103
- {"query": "how far is the man in the red hat from the pallet of boxes in feet?.", "media": "images/5.jpg", "model": "SpaceOm-3B", "mode": "image"},
104
- ]
105
-
106
- video_examples = [
107
- {"query": "give the highlights of the movie scene video.", "media": "videos/1.mp4", "model": "DeepCaption-VLA-7B", "mode": "video"},
108
- {"query": "explain the advertisement in detail.", "media": "videos/2.mp4", "model": "SkyCaptioner-V1", "mode": "video"},
109
  ]
110
 
111
- all_examples = image_examples + video_examples
112
-
113
 
114
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
115
  buf = BytesIO()
@@ -128,27 +106,15 @@ def file_to_data_url(path):
128
  "jpeg": "image/jpeg",
129
  "png": "image/png",
130
  "webp": "image/webp",
131
- "mp4": "video/mp4",
132
- "mov": "video/quicktime",
133
- "webm": "video/webm",
134
- }.get(ext, "application/octet-stream")
135
  with open(path, "rb") as f:
136
  data = base64.b64encode(f.read()).decode()
137
  return f"data:{mime};base64,{data}"
138
 
139
 
140
- def make_thumb_b64(path, mode="image", max_dim=240):
141
  try:
142
- if mode == "video":
143
- cap = cv2.VideoCapture(path)
144
- ok, frame = cap.read()
145
- cap.release()
146
- if not ok:
147
- return ""
148
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
149
- img = Image.fromarray(frame).convert("RGB")
150
- else:
151
- img = Image.open(path).convert("RGB")
152
  img.thumbnail((max_dim, max_dim))
153
  return pil_to_data_url(img, "JPEG")
154
  except Exception as e:
@@ -158,15 +124,14 @@ def make_thumb_b64(path, mode="image", max_dim=240):
158
 
159
  def build_example_cards_html():
160
  cards = ""
161
- for i, ex in enumerate(all_examples):
162
- thumb = make_thumb_b64(ex["media"], ex["mode"])
163
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
164
- media_badge = "VIDEO" if ex["mode"] == "video" else "IMAGE"
165
  cards += f"""
166
  <div class="example-card" data-idx="{i}">
167
  <div class="example-thumb-wrap">
168
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
169
- <div class="example-media-chip">{media_badge}</div>
170
  </div>
171
  <div class="example-meta-row">
172
  <span class="example-badge">{ex["model"]}</span>
@@ -185,18 +150,17 @@ def load_example_data(idx_str):
185
  idx = int(float(idx_str))
186
  except Exception:
187
  return json.dumps({"status": "error", "message": "Invalid example index"})
188
- if idx < 0 or idx >= len(all_examples):
189
  return json.dumps({"status": "error", "message": "Example index out of range"})
190
- ex = all_examples[idx]
191
  media_b64 = file_to_data_url(ex["media"])
192
  if not media_b64:
193
- return json.dumps({"status": "error", "message": f"Could not load example {ex['mode']}"})
194
  return json.dumps({
195
  "status": "ok",
196
  "query": ex["query"],
197
  "media": media_b64,
198
  "model": ex["model"],
199
- "mode": ex["mode"],
200
  "name": os.path.basename(ex["media"]),
201
  })
202
 
@@ -215,54 +179,6 @@ def b64_to_pil(b64_str):
215
  return None
216
 
217
 
218
- def b64_to_temp_video(b64_str):
219
- if not b64_str:
220
- return None
221
- try:
222
- if b64_str.startswith("data:"):
223
- header, data = b64_str.split(",", 1)
224
- mime = header.split(";")[0].replace("data:", "")
225
- else:
226
- data = b64_str
227
- mime = "video/mp4"
228
- ext = {
229
- "video/mp4": ".mp4",
230
- "video/webm": ".webm",
231
- "video/quicktime": ".mov",
232
- }.get(mime, ".mp4")
233
- raw = base64.b64decode(data)
234
- temp_dir = os.path.join("/tmp", "visionscope_r2_media")
235
- os.makedirs(temp_dir, exist_ok=True)
236
- path = os.path.join(temp_dir, f"{uuid.uuid4().hex}{ext}")
237
- with open(path, "wb") as f:
238
- f.write(raw)
239
- return path
240
- except Exception:
241
- return None
242
-
243
-
244
- def downsample_video(video_path):
245
- vidcap = cv2.VideoCapture(video_path)
246
- total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
247
- fps = vidcap.get(cv2.CAP_PROP_FPS) or 1.0
248
- frames = []
249
- frame_count = min(total_frames, 10) if total_frames > 0 else 0
250
- if frame_count == 0:
251
- vidcap.release()
252
- return frames
253
- frame_indices = np.linspace(0, total_frames - 1, frame_count, dtype=int)
254
- for i in frame_indices:
255
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
256
- success, image = vidcap.read()
257
- if success:
258
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
259
- pil_image = Image.fromarray(image)
260
- timestamp = round(float(i) / float(fps), 2)
261
- frames.append((pil_image, timestamp))
262
- vidcap.release()
263
- return frames
264
-
265
-
266
  def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
267
  try:
268
  return int(gpu_timeout)
@@ -270,13 +186,6 @@ def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top
270
  return 60
271
 
272
 
273
- def calc_timeout_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
274
- try:
275
- return int(gpu_timeout)
276
- except Exception:
277
- return 60
278
-
279
-
280
  @spaces.GPU(duration=calc_timeout_image)
281
  def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
282
  if not model_name or model_name not in MODEL_MAP:
@@ -339,102 +248,19 @@ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6
339
  torch.cuda.empty_cache()
340
 
341
 
342
- @spaces.GPU(duration=calc_timeout_video)
343
- def generate_video(model_name, text, video_path, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=90):
344
- if not model_name or model_name not in MODEL_MAP:
345
- raise gr.Error("Please select a valid model.")
346
- if not video_path:
347
- raise gr.Error("Please upload a video.")
348
- if not text or not str(text).strip():
349
- raise gr.Error("Please enter your instruction.")
350
- if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
351
- raise gr.Error("Query is too long. Please shorten your input.")
352
-
353
- processor, model = MODEL_MAP[model_name]
354
- frames = downsample_video(video_path)
355
- if not frames:
356
- raise gr.Error("Could not read the uploaded video.")
357
-
358
- messages = [
359
- {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
360
- {"role": "user", "content": [{"type": "text", "text": text}]}
361
- ]
362
-
363
- for image, timestamp in frames:
364
- messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
365
- messages[1]["content"].append({"type": "image", "image": image})
366
-
367
- inputs = processor.apply_chat_template(
368
- messages,
369
- tokenize=True,
370
- add_generation_prompt=True,
371
- return_dict=True,
372
- return_tensors="pt",
373
- truncation=True,
374
- max_length=MAX_INPUT_TOKEN_LENGTH
375
- ).to(device)
376
-
377
- streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
378
- generation_kwargs = {
379
- **inputs,
380
- "streamer": streamer,
381
- "max_new_tokens": int(max_new_tokens),
382
- "do_sample": True,
383
- "temperature": float(temperature),
384
- "top_p": float(top_p),
385
- "top_k": int(top_k),
386
- "repetition_penalty": float(repetition_penalty),
387
- }
388
-
389
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
390
- thread.start()
391
-
392
- buffer = ""
393
- for new_text in streamer:
394
- buffer += new_text.replace("<|im_end|>", "")
395
- time.sleep(0.01)
396
- yield buffer
397
-
398
- gc.collect()
399
- if torch.cuda.is_available():
400
- torch.cuda.empty_cache()
401
-
402
-
403
- def run_inference(mode, model_name, text, image_b64, video_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
404
- if mode == "video":
405
- temp_video_path = b64_to_temp_video(video_b64)
406
- if not temp_video_path:
407
- raise gr.Error("Could not decode uploaded video.")
408
- try:
409
- yield from generate_video(
410
- model_name=model_name,
411
- text=text,
412
- video_path=temp_video_path,
413
- max_new_tokens=max_new_tokens_v,
414
- temperature=temperature_v,
415
- top_p=top_p_v,
416
- top_k=top_k_v,
417
- repetition_penalty=repetition_penalty_v,
418
- gpu_timeout=gpu_timeout_v,
419
- )
420
- finally:
421
- try:
422
- os.remove(temp_video_path)
423
- except Exception:
424
- pass
425
- else:
426
- image = b64_to_pil(image_b64)
427
- yield from generate_image(
428
- model_name=model_name,
429
- text=text,
430
- image=image,
431
- max_new_tokens=max_new_tokens_v,
432
- temperature=temperature_v,
433
- top_p=top_p_v,
434
- top_k=top_k_v,
435
- repetition_penalty=repetition_penalty_v,
436
- gpu_timeout=gpu_timeout_v,
437
- )
438
 
439
 
440
  def noop():
@@ -500,19 +326,6 @@ footer{display:none!important}
500
  .model-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
501
  .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
502
 
503
- .mode-tabs-bar{
504
- background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px 12px;
505
- display:flex;gap:8px;align-items:center;flex-wrap:wrap;
506
- }
507
- .mode-tab{
508
- display:inline-flex;align-items:center;justify-content:center;gap:6px;
509
- min-width:110px;height:34px;background:transparent;border:1px solid #27272a;
510
- border-radius:999px;cursor:pointer;font-size:12px;font-weight:700;padding:0 14px;
511
- color:#ffffff!important;transition:all .15s ease;text-transform:uppercase;letter-spacing:.5px;
512
- }
513
- .mode-tab:hover{background:rgba(0,0,255,.12);border-color:rgba(0,0,255,.35)}
514
- .mode-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
515
-
516
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
517
  .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
518
  .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
@@ -548,7 +361,7 @@ footer{display:none!important}
548
  overflow:hidden;border:1px solid #27272a;background:#111114;
549
  display:flex;align-items:center;justify-content:center;position:relative;
550
  }
551
- .single-preview-card img,.single-preview-card video{
552
  width:100%;height:100%;max-width:100%;max-height:100%;
553
  object-fit:contain;display:block;background:#000;
554
  }
@@ -782,24 +595,23 @@ function init() {
782
  const fileInput = document.getElementById('custom-file-input');
783
  const previewWrap = document.getElementById('single-preview-wrap');
784
  const previewImg = document.getElementById('single-preview-img');
785
- const previewVideo = document.getElementById('single-preview-video');
786
  const btnUpload = document.getElementById('preview-upload-btn');
787
  const btnClear = document.getElementById('preview-clear-btn');
788
  const promptInput = document.getElementById('custom-query-input');
789
  const runBtnEl = document.getElementById('custom-run-btn');
790
  const outputArea = document.getElementById('custom-output-textarea');
791
  const mediaStatus = document.getElementById('sb-media-status');
792
- const exampleResultContainer = document.getElementById('example-result-data');
793
 
794
- if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !previewVideo) {
795
  setTimeout(init, 250);
796
  return;
797
  }
798
 
799
  window.__visionScopeInitDone = true;
800
  let mediaState = null;
801
- let currentMode = 'image';
802
  let toastTimer = null;
 
 
803
 
804
  function showToast(message, type) {
805
  let toast = document.getElementById('app-toast');
@@ -851,6 +663,13 @@ function init() {
851
  setTimeout(() => outputArea.classList.remove('error-flash'), 800);
852
  }
853
 
 
 
 
 
 
 
 
854
  function setGradioValue(containerId, value) {
855
  const container = document.getElementById(containerId);
856
  if (!container) return;
@@ -866,10 +685,9 @@ function init() {
866
  });
867
  }
868
 
869
- function syncMediaToGradio() {
870
- setGradioValue('hidden-image-b64', mediaState && mediaState.mode === 'image' ? mediaState.b64 : '');
871
- setGradioValue('hidden-video-b64', mediaState && mediaState.mode === 'video' ? mediaState.b64 : '');
872
- const txt = mediaState ? (`1 ${mediaState.mode} uploaded`) : `No ${currentMode} uploaded`;
873
  if (mediaStatus) mediaStatus.textContent = txt;
874
  }
875
 
@@ -881,43 +699,25 @@ function init() {
881
  setGradioValue('hidden-model-name', name);
882
  }
883
 
884
- function syncModeToGradio(mode) {
885
- setGradioValue('hidden-mode-name', mode);
886
- }
887
-
888
  function renderPreview() {
889
  if (!mediaState) {
890
  previewImg.src = '';
891
- previewVideo.src = '';
892
  previewImg.style.display = 'none';
893
- previewVideo.style.display = 'none';
894
  previewWrap.style.display = 'none';
895
  if (uploadPrompt) uploadPrompt.style.display = 'flex';
896
- syncMediaToGradio();
897
  return;
898
  }
899
 
900
- if (mediaState.mode === 'video') {
901
- previewImg.src = '';
902
- previewImg.style.display = 'none';
903
- previewVideo.src = mediaState.b64;
904
- previewVideo.style.display = 'block';
905
- previewWrap.style.display = 'flex';
906
- } else {
907
- previewVideo.pause();
908
- previewVideo.removeAttribute('src');
909
- previewVideo.load();
910
- previewVideo.style.display = 'none';
911
- previewImg.src = mediaState.b64;
912
- previewImg.style.display = 'block';
913
- previewWrap.style.display = 'flex';
914
- }
915
  if (uploadPrompt) uploadPrompt.style.display = 'none';
916
- syncMediaToGradio();
917
  }
918
 
919
- function setPreview(b64, name, mode) {
920
- mediaState = {b64, name: name || 'file', mode: mode || currentMode};
921
  renderPreview();
922
  }
923
  window.__setPreview = setPreview;
@@ -930,40 +730,25 @@ function init() {
930
 
931
  function processFile(file) {
932
  if (!file) return;
933
- if (currentMode === 'image' && !file.type.startsWith('image/')) {
934
- showToast('Only image files are supported in Image mode', 'error');
935
- return;
936
- }
937
- if (currentMode === 'video' && !file.type.startsWith('video/')) {
938
- showToast('Only video files are supported in Video mode', 'error');
939
  return;
940
  }
941
  const reader = new FileReader();
942
- reader.onload = (e) => setPreview(e.target.result, file.name, currentMode);
943
  reader.readAsDataURL(file);
944
  }
945
 
 
 
 
 
946
  fileInput.addEventListener('change', (e) => {
947
  const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
948
  if (file) processFile(file);
949
  e.target.value = '';
950
  });
951
 
952
- function updateAccept() {
953
- fileInput.accept = currentMode === 'video' ? 'video/*' : 'image/*';
954
- const main = document.getElementById('upload-main-text');
955
- const sub = document.getElementById('upload-sub-text');
956
- if (main) main.textContent = currentMode === 'video' ? 'Click or drag a video here' : 'Click or drag an image here';
957
- if (sub) sub.textContent = currentMode === 'video'
958
- ? 'Upload one short video clip for multimodal video understanding'
959
- : 'Upload one document, page, receipt, screenshot, or scene image for vision tasks';
960
- if (!mediaState && mediaStatus) mediaStatus.textContent = `No ${currentMode} uploaded`;
961
- }
962
-
963
- if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
964
- if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
965
- if (btnClear) btnClear.addEventListener('click', clearPreview);
966
-
967
  dropZone.addEventListener('dragover', (e) => {
968
  e.preventDefault();
969
  dropZone.classList.add('drag-over');
@@ -988,28 +773,11 @@ function init() {
988
  }
989
  window.__activateModelTab = activateModelTab;
990
 
991
- function activateModeTab(mode) {
992
- currentMode = mode;
993
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
994
- btn.classList.toggle('active', btn.getAttribute('data-mode') === mode);
995
- });
996
- syncModeToGradio(mode);
997
- updateAccept();
998
- if (mediaState && mediaState.mode !== mode) {
999
- clearPreview();
1000
- }
1001
- }
1002
- window.__activateModeTab = activateModeTab;
1003
-
1004
  document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
1005
  btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
1006
  });
1007
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
1008
- btn.addEventListener('click', () => activateModeTab(btn.getAttribute('data-mode')));
1009
- });
1010
 
1011
  activateModelTab('DeepCaption-VLA-7B');
1012
- activateModeTab('image');
1013
 
1014
  function syncSlider(customId, gradioId) {
1015
  const slider = document.getElementById(customId);
@@ -1040,16 +808,12 @@ function init() {
1040
  function validateBeforeRun() {
1041
  const promptVal = promptInput.value.trim();
1042
  if (!mediaState && !promptVal) {
1043
- showToast(`Please upload a ${currentMode} and enter your instruction`, 'error');
1044
  flashPromptError();
1045
  return false;
1046
  }
1047
  if (!mediaState) {
1048
- showToast(`Please upload a ${currentMode}`, 'error');
1049
- return false;
1050
- }
1051
- if (mediaState.mode !== currentMode) {
1052
- showToast(`Uploaded media does not match ${currentMode} mode`, 'error');
1053
  return false;
1054
  }
1055
  if (!promptVal) {
@@ -1068,11 +832,9 @@ function init() {
1068
  window.__clickGradioRunBtn = function() {
1069
  if (!validateBeforeRun()) return;
1070
  syncPromptToGradio();
1071
- syncMediaToGradio();
1072
  const activeModel = document.querySelector('.model-tab.active');
1073
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
1074
- const activeMode = document.querySelector('.mode-tab.active');
1075
- if (activeMode) syncModeToGradio(activeMode.getAttribute('data-mode'));
1076
  if (outputArea) outputArea.value = '';
1077
  showLoader();
1078
  setTimeout(() => {
@@ -1126,55 +888,86 @@ function init() {
1126
  });
1127
  }
1128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1129
  document.querySelectorAll('.example-card[data-idx]').forEach(card => {
1130
  card.addEventListener('click', () => {
1131
  const idx = card.getAttribute('data-idx');
1132
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1133
  card.classList.add('loading');
1134
  showToast('Loading example...', 'info');
 
1135
  setGradioValue('example-result-data', '');
1136
  setGradioValue('example-idx-input', idx);
 
1137
  setTimeout(() => {
1138
  const btn = document.getElementById('example-load-btn');
1139
  if (btn) {
1140
  const b = btn.querySelector('button');
1141
  if (b) b.click(); else btn.click();
1142
  }
1143
- }, 150);
1144
- setTimeout(() => card.classList.remove('loading'), 12000);
1145
  });
1146
  });
1147
 
1148
- function checkExampleResult() {
1149
- if (!exampleResultContainer) return;
1150
- const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
1151
- if (!el || !el.value) return;
1152
- if (window.__lastExampleVal === el.value) return;
1153
- try {
1154
- const data = JSON.parse(el.value);
1155
- if (data.status === 'ok') {
1156
- window.__lastExampleVal = el.value;
1157
- if (data.mode) activateModeTab(data.mode);
1158
- if (data.media) setPreview(data.media, data.name || 'example', data.mode || 'image');
1159
- if (data.query) {
1160
- promptInput.value = data.query;
1161
- syncPromptToGradio();
1162
  }
1163
- if (data.model) activateModelTab(data.model);
1164
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1165
- showToast('Example loaded', 'info');
1166
- } else if (data.status === 'error') {
1167
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1168
- showToast(data.message || 'Failed to load example', 'error');
1169
  }
1170
- } catch(e) {}
1171
- }
1172
-
1173
- const obsExample = new MutationObserver(checkExampleResult);
1174
- if (exampleResultContainer) {
1175
- obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
1176
  }
1177
- setInterval(checkExampleResult, 500);
1178
 
1179
  if (outputArea) outputArea.value = '';
1180
  const sb = document.getElementById('sb-run-state');
@@ -1236,15 +1029,8 @@ MODEL_TABS_HTML = "".join([
1236
  for m in MODEL_CHOICES
1237
  ])
1238
 
1239
- MODE_TABS_HTML = """
1240
- <button class="mode-tab active" data-mode="image">Image Inference</button>
1241
- <button class="mode-tab" data-mode="video">Video Inference</button>
1242
- """
1243
-
1244
  with gr.Blocks() as demo:
1245
- hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
1246
  hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
1247
- hidden_video_b64 = gr.Textbox(value="", elem_id="hidden-video-b64", elem_classes="hidden-input", container=False)
1248
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1249
  hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1250
 
@@ -1268,7 +1054,7 @@ with gr.Blocks() as demo:
1268
  <div class="app-logo">{VISION_LOGO_SVG}</div>
1269
  <span class="app-title">VisionScope R2</span>
1270
  <span class="app-badge">vision enabled</span>
1271
- <span class="app-badge fast">Image + Video</span>
1272
  </div>
1273
  </div>
1274
 
@@ -1276,10 +1062,6 @@ with gr.Blocks() as demo:
1276
  {MODEL_TABS_HTML}
1277
  </div>
1278
 
1279
- <div class="mode-tabs-bar">
1280
- {MODE_TABS_HTML}
1281
- </div>
1282
-
1283
  <div class="app-main-row">
1284
  <div class="app-main-left">
1285
  <div id="media-drop-zone">
@@ -1296,7 +1078,6 @@ with gr.Blocks() as demo:
1296
  <div id="single-preview-wrap" class="single-preview-wrap">
1297
  <div class="single-preview-card">
1298
  <img id="single-preview-img" src="" alt="Preview" style="display:none;">
1299
- <video id="single-preview-video" controls playsinline style="display:none;"></video>
1300
  <div class="preview-overlay-actions">
1301
  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1302
  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
@@ -1306,10 +1087,9 @@ with gr.Blocks() as demo:
1306
  </div>
1307
 
1308
  <div class="hint-bar">
1309
- <b>Upload:</b> Click or drag media into the panel &nbsp;&middot;&nbsp;
1310
- <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
1311
  <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
1312
- <kbd>Clear</kbd> removes the current media
1313
  </div>
1314
 
1315
  <div class="examples-section">
@@ -1325,7 +1105,7 @@ with gr.Blocks() as demo:
1325
  <div class="panel-card-title">Vision Instruction</div>
1326
  <div class="panel-card-body">
1327
  <label class="modern-label" for="custom-query-input">Query Input</label>
1328
- <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., describe the scene, read the handwriting, explain the video, summarize frames, extract visible text, estimate distance..."></textarea>
1329
  </div>
1330
  </div>
1331
 
@@ -1412,11 +1192,9 @@ with gr.Blocks() as demo:
1412
  run_btn.click(
1413
  fn=run_inference,
1414
  inputs=[
1415
- hidden_mode_name,
1416
  hidden_model_name,
1417
  prompt,
1418
  hidden_image_b64,
1419
- hidden_video_b64,
1420
  max_new_tokens,
1421
  temperature,
1422
  top_p,
@@ -1425,30 +1203,20 @@ with gr.Blocks() as demo:
1425
  gpu_duration_state,
1426
  ],
1427
  outputs=[result],
1428
- js=r"""(mode, model, p, img, vid, mnt, t, tp, tk, rp, gd) => {
1429
  const modelEl = document.querySelector('.model-tab.active');
1430
- const modeEl = document.querySelector('.mode-tab.active');
1431
  const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
1432
- const modeVal = modeEl ? modeEl.getAttribute('data-mode') : mode;
1433
  const promptEl = document.getElementById('custom-query-input');
1434
  const promptVal = promptEl ? promptEl.value : p;
1435
 
1436
  let imgVal = img;
1437
- let vidVal = vid;
1438
-
1439
  const imgContainer = document.getElementById('hidden-image-b64');
1440
- const vidContainer = document.getElementById('hidden-video-b64');
1441
-
1442
  if (imgContainer) {
1443
  const inner = imgContainer.querySelector('textarea, input');
1444
  if (inner) imgVal = inner.value;
1445
  }
1446
- if (vidContainer) {
1447
- const inner = vidContainer.querySelector('textarea, input');
1448
- if (inner) vidVal = inner.value;
1449
- }
1450
 
1451
- return [modeVal, modelVal, promptVal, imgVal, vidVal, mnt, t, tp, tk, rp, gd];
1452
  }""",
1453
  )
1454
 
@@ -1465,5 +1233,5 @@ if __name__ == "__main__":
1465
  mcp_server=True,
1466
  ssr_mode=False,
1467
  show_error=True,
1468
- allowed_paths=["images", "videos"],
1469
  )
 
1
  import os
2
  import gc
 
3
  import json
 
4
  import time
5
  import base64
 
6
  from io import BytesIO
7
  from threading import Thread
8
 
9
  import gradio as gr
10
  import spaces
11
  import torch
12
+ from PIL import Image
 
 
13
 
14
  from transformers import (
15
  Qwen2_5_VLForConditionalGeneration,
 
25
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
26
  print("Using device:", device)
27
 
 
 
 
 
 
 
 
 
 
 
28
  MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
29
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
30
  model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
81
  MODEL_CHOICES = list(MODEL_MAP.keys())
82
 
83
  image_examples = [
84
+ {"query": "type out the messy hand-writing as accurately as you can.", "media": "images/1.jpg", "model": "coreOCR-7B-050325-preview"},
85
+ {"query": "count the number of birds and explain the scene in detail.", "media": "images/2.jpeg", "model": "DeepCaption-VLA-7B"},
86
+ {"query": "how far is the Goal from the penalty taker in this image?.", "media": "images/3.png", "model": "SpaceThinker-3B"},
87
+ {"query": "approximately how many meters apart are the chair and bookshelf?.", "media": "images/4.png", "model": "SkyCaptioner-V1"},
88
+ {"query": "how far is the man in the red hat from the pallet of boxes in feet?.", "media": "images/5.jpg", "model": "SpaceOm-3B"},
 
 
 
 
 
89
  ]
90
 
 
 
91
 
92
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
93
  buf = BytesIO()
 
106
  "jpeg": "image/jpeg",
107
  "png": "image/png",
108
  "webp": "image/webp",
109
+ }.get(ext, "image/jpeg")
 
 
 
110
  with open(path, "rb") as f:
111
  data = base64.b64encode(f.read()).decode()
112
  return f"data:{mime};base64,{data}"
113
 
114
 
115
+ def make_thumb_b64(path, max_dim=240):
116
  try:
117
+ img = Image.open(path).convert("RGB")
 
 
 
 
 
 
 
 
 
118
  img.thumbnail((max_dim, max_dim))
119
  return pil_to_data_url(img, "JPEG")
120
  except Exception as e:
 
124
 
125
  def build_example_cards_html():
126
  cards = ""
127
+ for i, ex in enumerate(image_examples):
128
+ thumb = make_thumb_b64(ex["media"])
129
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
 
130
  cards += f"""
131
  <div class="example-card" data-idx="{i}">
132
  <div class="example-thumb-wrap">
133
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
134
+ <div class="example-media-chip">IMAGE</div>
135
  </div>
136
  <div class="example-meta-row">
137
  <span class="example-badge">{ex["model"]}</span>
 
150
  idx = int(float(idx_str))
151
  except Exception:
152
  return json.dumps({"status": "error", "message": "Invalid example index"})
153
+ if idx < 0 or idx >= len(image_examples):
154
  return json.dumps({"status": "error", "message": "Example index out of range"})
155
+ ex = image_examples[idx]
156
  media_b64 = file_to_data_url(ex["media"])
157
  if not media_b64:
158
+ return json.dumps({"status": "error", "message": "Could not load example image"})
159
  return json.dumps({
160
  "status": "ok",
161
  "query": ex["query"],
162
  "media": media_b64,
163
  "model": ex["model"],
 
164
  "name": os.path.basename(ex["media"]),
165
  })
166
 
 
179
  return None
180
 
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
183
  try:
184
  return int(gpu_timeout)
 
186
  return 60
187
 
188
 
 
 
 
 
 
 
 
189
  @spaces.GPU(duration=calc_timeout_image)
190
  def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
191
  if not model_name or model_name not in MODEL_MAP:
 
248
  torch.cuda.empty_cache()
249
 
250
 
251
+ def run_inference(model_name, text, image_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
252
+ image = b64_to_pil(image_b64)
253
+ yield from generate_image(
254
+ model_name=model_name,
255
+ text=text,
256
+ image=image,
257
+ max_new_tokens=max_new_tokens_v,
258
+ temperature=temperature_v,
259
+ top_p=top_p_v,
260
+ top_k=top_k_v,
261
+ repetition_penalty=repetition_penalty_v,
262
+ gpu_timeout=gpu_timeout_v,
263
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
 
266
  def noop():
 
326
  .model-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
327
  .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
330
  .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
331
  .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
 
361
  overflow:hidden;border:1px solid #27272a;background:#111114;
362
  display:flex;align-items:center;justify-content:center;position:relative;
363
  }
364
+ .single-preview-card img{
365
  width:100%;height:100%;max-width:100%;max-height:100%;
366
  object-fit:contain;display:block;background:#000;
367
  }
 
595
  const fileInput = document.getElementById('custom-file-input');
596
  const previewWrap = document.getElementById('single-preview-wrap');
597
  const previewImg = document.getElementById('single-preview-img');
 
598
  const btnUpload = document.getElementById('preview-upload-btn');
599
  const btnClear = document.getElementById('preview-clear-btn');
600
  const promptInput = document.getElementById('custom-query-input');
601
  const runBtnEl = document.getElementById('custom-run-btn');
602
  const outputArea = document.getElementById('custom-output-textarea');
603
  const mediaStatus = document.getElementById('sb-media-status');
 
604
 
605
+ if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg) {
606
  setTimeout(init, 250);
607
  return;
608
  }
609
 
610
  window.__visionScopeInitDone = true;
611
  let mediaState = null;
 
612
  let toastTimer = null;
613
+ let examplePoller = null;
614
+ let lastSeenExamplePayload = null;
615
 
616
  function showToast(message, type) {
617
  let toast = document.getElementById('app-toast');
 
663
  setTimeout(() => outputArea.classList.remove('error-flash'), 800);
664
  }
665
 
666
function getValueFromContainer(containerId) {
  // Read the current text of the <textarea>/<input> nested inside a Gradio
  // wrapper element; returns '' when the wrapper or field is missing.
  const wrapper = document.getElementById(containerId);
  if (!wrapper) {
    return '';
  }
  const field = wrapper.querySelector('textarea, input');
  if (!field) {
    return '';
  }
  return field.value || '';
}
672
+
673
  function setGradioValue(containerId, value) {
674
  const container = document.getElementById(containerId);
675
  if (!container) return;
 
685
  });
686
  }
687
 
688
function syncImageToGradio() {
  // Mirror the in-memory image state into the hidden Gradio textbox and
  // refresh the sidebar status label to match.
  const hasImage = Boolean(mediaState);
  setGradioValue('hidden-image-b64', hasImage ? mediaState.b64 : '');
  if (mediaStatus) {
    mediaStatus.textContent = hasImage ? '1 image uploaded' : 'No image uploaded';
  }
}
693
 
 
699
  setGradioValue('hidden-model-name', name);
700
  }
701
 
 
 
 
 
702
function renderPreview() {
  // Toggle between the empty-state upload prompt and the image preview card,
  // keeping the hidden Gradio image field in sync in both cases.
  const hasImage = Boolean(mediaState);
  previewImg.src = hasImage ? mediaState.b64 : '';
  previewImg.style.display = hasImage ? 'block' : 'none';
  previewWrap.style.display = hasImage ? 'flex' : 'none';
  if (uploadPrompt) {
    uploadPrompt.style.display = hasImage ? 'none' : 'flex';
  }
  syncImageToGradio();
}
718
 
719
function setPreview(b64, name) {
  // Remember the uploaded image (data-URL plus a display name) and redraw.
  mediaState = {
    b64: b64,
    name: name || 'file',
  };
  renderPreview();
}
723
  window.__setPreview = setPreview;
 
730
 
731
function processFile(file) {
  // Validate a dropped/selected file and load it as a data-URL preview.
  if (!file) {
    return;
  }
  const isImage = file.type.startsWith('image/');
  if (!isImage) {
    showToast('Only image files are supported', 'error');
    return;
  }
  const reader = new FileReader();
  reader.onload = function (evt) {
    setPreview(evt.target.result, file.name);
  };
  reader.readAsDataURL(file);
}
741
 
742
+ if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
743
+ if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
744
+ if (btnClear) btnClear.addEventListener('click', clearPreview);
745
+
746
  fileInput.addEventListener('change', (e) => {
747
  const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
748
  if (file) processFile(file);
749
  e.target.value = '';
750
  });
751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
  dropZone.addEventListener('dragover', (e) => {
753
  e.preventDefault();
754
  dropZone.classList.add('drag-over');
 
773
  }
774
  window.__activateModelTab = activateModelTab;
775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
776
  document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
777
  btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
778
  });
 
 
 
779
 
780
  activateModelTab('DeepCaption-VLA-7B');
 
781
 
782
  function syncSlider(customId, gradioId) {
783
  const slider = document.getElementById(customId);
 
808
  function validateBeforeRun() {
809
  const promptVal = promptInput.value.trim();
810
  if (!mediaState && !promptVal) {
811
+ showToast('Please upload an image and enter your instruction', 'error');
812
  flashPromptError();
813
  return false;
814
  }
815
  if (!mediaState) {
816
+ showToast('Please upload an image', 'error');
 
 
 
 
817
  return false;
818
  }
819
  if (!promptVal) {
 
832
  window.__clickGradioRunBtn = function() {
833
  if (!validateBeforeRun()) return;
834
  syncPromptToGradio();
835
+ syncImageToGradio();
836
  const activeModel = document.querySelector('.model-tab.active');
837
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
 
 
838
  if (outputArea) outputArea.value = '';
839
  showLoader();
840
  setTimeout(() => {
 
888
  });
889
  }
890
 
891
function applyExamplePayload(raw) {
  // Parse the JSON payload written by the hidden example-loader endpoint and
  // apply it to the UI: preview image, prompt text, and active model tab.
  const clearLoadingCards = () => {
    document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
  };
  // The try spans the whole routine so any failure while applying the
  // example (not just bad JSON) clears the spinners and surfaces a toast.
  try {
    const data = JSON.parse(raw);
    switch (data.status) {
      case 'ok':
        if (data.media) setPreview(data.media, data.name || 'example_file');
        if (data.query) {
          promptInput.value = data.query;
          syncPromptToGradio();
        }
        if (data.model) activateModelTab(data.model);
        clearLoadingCards();
        showToast('Example loaded', 'info');
        break;
      case 'error':
        clearLoadingCards();
        showToast(data.message || 'Failed to load example', 'error');
        break;
    }
  } catch (err) {
    clearLoadingCards();
    showToast('Failed to parse example data', 'error');
  }
}
912
+
913
function startExamplePolling() {
  // Poll the hidden example-result textbox until a fresh payload appears,
  // giving up after 80 ticks x 150ms (~12s).
  if (examplePoller) clearInterval(examplePoller);
  let ticks = 0;
  const stop = () => {
    clearInterval(examplePoller);
    examplePoller = null;
  };
  examplePoller = setInterval(() => {
    ticks += 1;
    const payload = getValueFromContainer('example-result-data');
    if (payload && payload !== lastSeenExamplePayload) {
      lastSeenExamplePayload = payload;
      stop();
      applyExamplePayload(payload);
      return;
    }
    if (ticks >= 80) {
      stop();
      document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
      showToast('Example load timed out', 'error');
    }
  }, 150);
}
934
+
935
document.querySelectorAll('.example-card[data-idx]').forEach((card) => {
  // Clicking an example card kicks off the hidden Gradio example loader.
  card.addEventListener('click', () => {
    const idx = card.getAttribute('data-idx');
    // Reset any stale "loading" markers, then flag the clicked card.
    document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
    card.classList.add('loading');
    showToast('Loading example...', 'info');

    setGradioValue('example-result-data', '');
    setGradioValue('example-idx-input', idx);

    // Give Gradio a beat to register the synced inputs before firing the
    // hidden load button, then start watching for the result payload.
    setTimeout(() => {
      const holder = document.getElementById('example-load-btn');
      if (holder) {
        const inner = holder.querySelector('button');
        (inner || holder).click();
      }
      startExamplePolling();
    }, 220);
  });
});
955
 
956
const observerTarget = document.getElementById('example-result-data');
if (observerTarget) {
  // Backup channel: a MutationObserver catches payload updates even if the
  // interval poller misses them, and cancels the poller once one arrives.
  const handleMutation = () => {
    const payload = getValueFromContainer('example-result-data');
    if (!payload || payload === lastSeenExamplePayload) return;
    lastSeenExamplePayload = payload;
    if (examplePoller) {
      clearInterval(examplePoller);
      examplePoller = null;
    }
    applyExamplePayload(payload);
  };
  const obs = new MutationObserver(handleMutation);
  obs.observe(observerTarget, {childList: true, subtree: true, characterData: true, attributes: true});
}
 
971
 
972
  if (outputArea) outputArea.value = '';
973
  const sb = document.getElementById('sb-run-state');
 
1029
  for m in MODEL_CHOICES
1030
  ])
1031
 
 
 
 
 
 
1032
  with gr.Blocks() as demo:
 
1033
  hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
 
1034
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1035
  hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1036
 
 
1054
  <div class="app-logo">{VISION_LOGO_SVG}</div>
1055
  <span class="app-title">VisionScope R2</span>
1056
  <span class="app-badge">vision enabled</span>
1057
+ <span class="app-badge fast">Image Inference</span>
1058
  </div>
1059
  </div>
1060
 
 
1062
  {MODEL_TABS_HTML}
1063
  </div>
1064
 
 
 
 
 
1065
  <div class="app-main-row">
1066
  <div class="app-main-left">
1067
  <div id="media-drop-zone">
 
1078
  <div id="single-preview-wrap" class="single-preview-wrap">
1079
  <div class="single-preview-card">
1080
  <img id="single-preview-img" src="" alt="Preview" style="display:none;">
 
1081
  <div class="preview-overlay-actions">
1082
  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1083
  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
 
1087
  </div>
1088
 
1089
  <div class="hint-bar">
1090
+ <b>Upload:</b> Click or drag an image into the panel &nbsp;&middot;&nbsp;
 
1091
  <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
1092
+ <kbd>Clear</kbd> removes the current image
1093
  </div>
1094
 
1095
  <div class="examples-section">
 
1105
  <div class="panel-card-title">Vision Instruction</div>
1106
  <div class="panel-card-body">
1107
  <label class="modern-label" for="custom-query-input">Query Input</label>
1108
+ <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., describe the scene, read the handwriting, extract visible text, estimate distance..."></textarea>
1109
  </div>
1110
  </div>
1111
 
 
1192
  run_btn.click(
1193
  fn=run_inference,
1194
  inputs=[
 
1195
  hidden_model_name,
1196
  prompt,
1197
  hidden_image_b64,
 
1198
  max_new_tokens,
1199
  temperature,
1200
  top_p,
 
1203
  gpu_duration_state,
1204
  ],
1205
  outputs=[result],
1206
+ js=r"""(model, p, img, mnt, t, tp, tk, rp, gd) => {
1207
  const modelEl = document.querySelector('.model-tab.active');
 
1208
  const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
 
1209
  const promptEl = document.getElementById('custom-query-input');
1210
  const promptVal = promptEl ? promptEl.value : p;
1211
 
1212
  let imgVal = img;
 
 
1213
  const imgContainer = document.getElementById('hidden-image-b64');
 
 
1214
  if (imgContainer) {
1215
  const inner = imgContainer.querySelector('textarea, input');
1216
  if (inner) imgVal = inner.value;
1217
  }
 
 
 
 
1218
 
1219
+ return [modelVal, promptVal, imgVal, mnt, t, tp, tk, rp, gd];
1220
  }""",
1221
  )
1222
 
 
1233
  mcp_server=True,
1234
  ssr_mode=False,
1235
  show_error=True,
1236
+ allowed_paths=["images"],
1237
  )