Spaces:

prithivMLmods
/

VisionScope-R2

Sleeping

App Files Files Community

prithivMLmods committed on Mar 22

Commit

eb6d2ca

verified ·

1 Parent(s): 7a98807

update app

Browse files

Files changed (1) hide show

app.py +126 -358

app.py CHANGED Viewed

@@ -1,20 +1,15 @@
 import os
 import gc
-import re
 import json
-import uuid
 import time
 import base64
-import random
 from io import BytesIO
 from threading import Thread
 import gradio as gr
 import spaces
 import torch
-import numpy as np
-from PIL import Image, ImageOps
-import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
@@ -30,16 +25,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 print("Using device:", device)
-def load_model(model_id, cls, **kwargs):
-    return cls.from_pretrained(
-        model_id,
-        trust_remote_code=True,
-        torch_dtype=torch.float16,
-        **kwargs
-    ).to(device).eval()
 MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
 processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -96,20 +81,13 @@ MODEL_MAP = {
 MODEL_CHOICES = list(MODEL_MAP.keys())
 image_examples = [
-    {"query": "type out the messy hand-writing as accurately as you can.", "media": "images/1.jpg", "model": "coreOCR-7B-050325-preview", "mode": "image"},
-    {"query": "count the number of birds and explain the scene in detail.", "media": "images/2.jpeg", "model": "DeepCaption-VLA-7B", "mode": "image"},
-    {"query": "how far is the Goal from the penalty taker in this image?.", "media": "images/3.png", "model": "SpaceThinker-3B", "mode": "image"},
-    {"query": "approximately how many meters apart are the chair and bookshelf?.", "media": "images/4.png", "model": "SkyCaptioner-V1", "mode": "image"},
-    {"query": "how far is the man in the red hat from the pallet of boxes in feet?.", "media": "images/5.jpg", "model": "SpaceOm-3B", "mode": "image"},
-]
-video_examples = [
-    {"query": "give the highlights of the movie scene video.", "media": "videos/1.mp4", "model": "DeepCaption-VLA-7B", "mode": "video"},
-    {"query": "explain the advertisement in detail.", "media": "videos/2.mp4", "model": "SkyCaptioner-V1", "mode": "video"},
 ]
-all_examples = image_examples + video_examples
 def pil_to_data_url(img: Image.Image, fmt="PNG"):
     buf = BytesIO()
@@ -128,27 +106,15 @@ def file_to_data_url(path):
         "jpeg": "image/jpeg",
         "png": "image/png",
         "webp": "image/webp",
-        "mp4": "video/mp4",
-        "mov": "video/quicktime",
-        "webm": "video/webm",
-    }.get(ext, "application/octet-stream")
     with open(path, "rb") as f:
         data = base64.b64encode(f.read()).decode()
     return f"data:{mime};base64,{data}"
-def make_thumb_b64(path, mode="image", max_dim=240):
     try:
-        if mode == "video":
-            cap = cv2.VideoCapture(path)
-            ok, frame = cap.read()
-            cap.release()
-            if not ok:
-                return ""
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            img = Image.fromarray(frame).convert("RGB")
-        else:
-            img = Image.open(path).convert("RGB")
         img.thumbnail((max_dim, max_dim))
         return pil_to_data_url(img, "JPEG")
     except Exception as e:
@@ -158,15 +124,14 @@ def make_thumb_b64(path, mode="image", max_dim=240):
 def build_example_cards_html():
     cards = ""
-    for i, ex in enumerate(all_examples):
-        thumb = make_thumb_b64(ex["media"], ex["mode"])
         prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
-        media_badge = "VIDEO" if ex["mode"] == "video" else "IMAGE"
         cards += f"""
         <div class="example-card" data-idx="{i}">
             <div class="example-thumb-wrap">
                 {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
-                <div class="example-media-chip">{media_badge}</div>
             </div>
             <div class="example-meta-row">
                 <span class="example-badge">{ex["model"]}</span>
@@ -185,18 +150,17 @@ def load_example_data(idx_str):
         idx = int(float(idx_str))
     except Exception:
         return json.dumps({"status": "error", "message": "Invalid example index"})
-    if idx < 0 or idx >= len(all_examples):
         return json.dumps({"status": "error", "message": "Example index out of range"})
-    ex = all_examples[idx]
     media_b64 = file_to_data_url(ex["media"])
     if not media_b64:
-        return json.dumps({"status": "error", "message": f"Could not load example {ex['mode']}"})
     return json.dumps({
         "status": "ok",
         "query": ex["query"],
         "media": media_b64,
         "model": ex["model"],
-        "mode": ex["mode"],
         "name": os.path.basename(ex["media"]),
     })
@@ -215,54 +179,6 @@ def b64_to_pil(b64_str):
         return None
-def b64_to_temp_video(b64_str):
-    if not b64_str:
-        return None
-    try:
-        if b64_str.startswith("data:"):
-            header, data = b64_str.split(",", 1)
-            mime = header.split(";")[0].replace("data:", "")
-        else:
-            data = b64_str
-            mime = "video/mp4"
-        ext = {
-            "video/mp4": ".mp4",
-            "video/webm": ".webm",
-            "video/quicktime": ".mov",
-        }.get(mime, ".mp4")
-        raw = base64.b64decode(data)
-        temp_dir = os.path.join("/tmp", "visionscope_r2_media")
-        os.makedirs(temp_dir, exist_ok=True)
-        path = os.path.join(temp_dir, f"{uuid.uuid4().hex}{ext}")
-        with open(path, "wb") as f:
-            f.write(raw)
-        return path
-    except Exception:
-        return None
-def downsample_video(video_path):
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS) or 1.0
-    frames = []
-    frame_count = min(total_frames, 10) if total_frames > 0 else 0
-    if frame_count == 0:
-        vidcap.release()
-        return frames
-    frame_indices = np.linspace(0, total_frames - 1, frame_count, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(float(i) / float(fps), 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames
 def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
     try:
         return int(gpu_timeout)
@@ -270,13 +186,6 @@ def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top
         return 60
-def calc_timeout_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
-    try:
-        return int(gpu_timeout)
-    except Exception:
-        return 60
 @spaces.GPU(duration=calc_timeout_image)
 def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
     if not model_name or model_name not in MODEL_MAP:
@@ -339,102 +248,19 @@ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6
         torch.cuda.empty_cache()
-@spaces.GPU(duration=calc_timeout_video)
-def generate_video(model_name, text, video_path, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=90):
-    if not model_name or model_name not in MODEL_MAP:
-        raise gr.Error("Please select a valid model.")
-    if not video_path:
-        raise gr.Error("Please upload a video.")
-    if not text or not str(text).strip():
-        raise gr.Error("Please enter your instruction.")
-    if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
-        raise gr.Error("Query is too long. Please shorten your input.")
-    processor, model = MODEL_MAP[model_name]
-    frames = downsample_video(video_path)
-    if not frames:
-        raise gr.Error("Could not read the uploaded video.")
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
-    ]
-    for image, timestamp in frames:
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt",
-        truncation=True,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": int(max_new_tokens),
-        "do_sample": True,
-        "temperature": float(temperature),
-        "top_p": float(top_p),
-        "top_k": int(top_k),
-        "repetition_penalty": float(repetition_penalty),
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-def run_inference(mode, model_name, text, image_b64, video_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
-    if mode == "video":
-        temp_video_path = b64_to_temp_video(video_b64)
-        if not temp_video_path:
-            raise gr.Error("Could not decode uploaded video.")
-        try:
-            yield from generate_video(
-                model_name=model_name,
-                text=text,
-                video_path=temp_video_path,
-                max_new_tokens=max_new_tokens_v,
-                temperature=temperature_v,
-                top_p=top_p_v,
-                top_k=top_k_v,
-                repetition_penalty=repetition_penalty_v,
-                gpu_timeout=gpu_timeout_v,
-            )
-        finally:
-            try:
-                os.remove(temp_video_path)
-            except Exception:
-                pass
-    else:
-        image = b64_to_pil(image_b64)
-        yield from generate_image(
-            model_name=model_name,
-            text=text,
-            image=image,
-            max_new_tokens=max_new_tokens_v,
-            temperature=temperature_v,
-            top_p=top_p_v,
-            top_k=top_k_v,
-            repetition_penalty=repetition_penalty_v,
-            gpu_timeout=gpu_timeout_v,
-        )
 def noop():
@@ -500,19 +326,6 @@ footer{display:none!important}
 .model-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
 .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
-.mode-tabs-bar{
-    background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px 12px;
-    display:flex;gap:8px;align-items:center;flex-wrap:wrap;
-}
-.mode-tab{
-    display:inline-flex;align-items:center;justify-content:center;gap:6px;
-    min-width:110px;height:34px;background:transparent;border:1px solid #27272a;
-    border-radius:999px;cursor:pointer;font-size:12px;font-weight:700;padding:0 14px;
-    color:#ffffff!important;transition:all .15s ease;text-transform:uppercase;letter-spacing:.5px;
-}
-.mode-tab:hover{background:rgba(0,0,255,.12);border-color:rgba(0,0,255,.35)}
-.mode-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
 .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
 .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
 .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
@@ -548,7 +361,7 @@ footer{display:none!important}
     overflow:hidden;border:1px solid #27272a;background:#111114;
     display:flex;align-items:center;justify-content:center;position:relative;
 }
-.single-preview-card img,.single-preview-card video{
     width:100%;height:100%;max-width:100%;max-height:100%;
     object-fit:contain;display:block;background:#000;
 }
@@ -782,24 +595,23 @@ function init() {
     const fileInput = document.getElementById('custom-file-input');
     const previewWrap = document.getElementById('single-preview-wrap');
     const previewImg = document.getElementById('single-preview-img');
-    const previewVideo = document.getElementById('single-preview-video');
     const btnUpload = document.getElementById('preview-upload-btn');
     const btnClear = document.getElementById('preview-clear-btn');
     const promptInput = document.getElementById('custom-query-input');
     const runBtnEl = document.getElementById('custom-run-btn');
     const outputArea = document.getElementById('custom-output-textarea');
     const mediaStatus = document.getElementById('sb-media-status');
-    const exampleResultContainer = document.getElementById('example-result-data');
-    if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !previewVideo) {
         setTimeout(init, 250);
         return;
     }
     window.__visionScopeInitDone = true;
     let mediaState = null;
-    let currentMode = 'image';
     let toastTimer = null;
     function showToast(message, type) {
         let toast = document.getElementById('app-toast');
@@ -851,6 +663,13 @@ function init() {
         setTimeout(() => outputArea.classList.remove('error-flash'), 800);
     }
     function setGradioValue(containerId, value) {
         const container = document.getElementById(containerId);
         if (!container) return;
@@ -866,10 +685,9 @@ function init() {
         });
     }
-    function syncMediaToGradio() {
-        setGradioValue('hidden-image-b64', mediaState && mediaState.mode === 'image' ? mediaState.b64 : '');
-        setGradioValue('hidden-video-b64', mediaState && mediaState.mode === 'video' ? mediaState.b64 : '');
-        const txt = mediaState ? (`1 ${mediaState.mode} uploaded`) : `No ${currentMode} uploaded`;
         if (mediaStatus) mediaStatus.textContent = txt;
     }
@@ -881,43 +699,25 @@ function init() {
         setGradioValue('hidden-model-name', name);
     }
-    function syncModeToGradio(mode) {
-        setGradioValue('hidden-mode-name', mode);
-    }
     function renderPreview() {
         if (!mediaState) {
             previewImg.src = '';
-            previewVideo.src = '';
             previewImg.style.display = 'none';
-            previewVideo.style.display = 'none';
             previewWrap.style.display = 'none';
             if (uploadPrompt) uploadPrompt.style.display = 'flex';
-            syncMediaToGradio();
             return;
         }
-        if (mediaState.mode === 'video') {
-            previewImg.src = '';
-            previewImg.style.display = 'none';
-            previewVideo.src = mediaState.b64;
-            previewVideo.style.display = 'block';
-            previewWrap.style.display = 'flex';
-        } else {
-            previewVideo.pause();
-            previewVideo.removeAttribute('src');
-            previewVideo.load();
-            previewVideo.style.display = 'none';
-            previewImg.src = mediaState.b64;
-            previewImg.style.display = 'block';
-            previewWrap.style.display = 'flex';
-        }
         if (uploadPrompt) uploadPrompt.style.display = 'none';
-        syncMediaToGradio();
     }
-    function setPreview(b64, name, mode) {
-        mediaState = {b64, name: name || 'file', mode: mode || currentMode};
         renderPreview();
     }
     window.__setPreview = setPreview;
@@ -930,40 +730,25 @@ function init() {
     function processFile(file) {
         if (!file) return;
-        if (currentMode === 'image' && !file.type.startsWith('image/')) {
-            showToast('Only image files are supported in Image mode', 'error');
-            return;
-        }
-        if (currentMode === 'video' && !file.type.startsWith('video/')) {
-            showToast('Only video files are supported in Video mode', 'error');
             return;
         }
         const reader = new FileReader();
-        reader.onload = (e) => setPreview(e.target.result, file.name, currentMode);
         reader.readAsDataURL(file);
     }
     fileInput.addEventListener('change', (e) => {
         const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
         if (file) processFile(file);
         e.target.value = '';
     });
-    function updateAccept() {
-        fileInput.accept = currentMode === 'video' ? 'video/*' : 'image/*';
-        const main = document.getElementById('upload-main-text');
-        const sub = document.getElementById('upload-sub-text');
-        if (main) main.textContent = currentMode === 'video' ? 'Click or drag a video here' : 'Click or drag an image here';
-        if (sub) sub.textContent = currentMode === 'video'
-            ? 'Upload one short video clip for multimodal video understanding'
-            : 'Upload one document, page, receipt, screenshot, or scene image for vision tasks';
-        if (!mediaState && mediaStatus) mediaStatus.textContent = `No ${currentMode} uploaded`;
-    }
-    if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
-    if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
-    if (btnClear) btnClear.addEventListener('click', clearPreview);
     dropZone.addEventListener('dragover', (e) => {
         e.preventDefault();
         dropZone.classList.add('drag-over');
@@ -988,28 +773,11 @@ function init() {
     }
     window.__activateModelTab = activateModelTab;
-    function activateModeTab(mode) {
-        currentMode = mode;
-        document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
-            btn.classList.toggle('active', btn.getAttribute('data-mode') === mode);
-        });
-        syncModeToGradio(mode);
-        updateAccept();
-        if (mediaState && mediaState.mode !== mode) {
-            clearPreview();
-        }
-    }
-    window.__activateModeTab = activateModeTab;
     document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
         btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
     });
-    document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
-        btn.addEventListener('click', () => activateModeTab(btn.getAttribute('data-mode')));
-    });
     activateModelTab('DeepCaption-VLA-7B');
-    activateModeTab('image');
     function syncSlider(customId, gradioId) {
         const slider = document.getElementById(customId);
@@ -1040,16 +808,12 @@ function init() {
     function validateBeforeRun() {
         const promptVal = promptInput.value.trim();
         if (!mediaState && !promptVal) {
-            showToast(`Please upload a ${currentMode} and enter your instruction`, 'error');
             flashPromptError();
             return false;
         }
         if (!mediaState) {
-            showToast(`Please upload a ${currentMode}`, 'error');
-            return false;
-        }
-        if (mediaState.mode !== currentMode) {
-            showToast(`Uploaded media does not match ${currentMode} mode`, 'error');
             return false;
         }
         if (!promptVal) {
@@ -1068,11 +832,9 @@ function init() {
     window.__clickGradioRunBtn = function() {
         if (!validateBeforeRun()) return;
         syncPromptToGradio();
-        syncMediaToGradio();
         const activeModel = document.querySelector('.model-tab.active');
         if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
-        const activeMode = document.querySelector('.mode-tab.active');
-        if (activeMode) syncModeToGradio(activeMode.getAttribute('data-mode'));
         if (outputArea) outputArea.value = '';
         showLoader();
         setTimeout(() => {
@@ -1126,55 +888,86 @@ function init() {
         });
     }
     document.querySelectorAll('.example-card[data-idx]').forEach(card => {
         card.addEventListener('click', () => {
             const idx = card.getAttribute('data-idx');
             document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
             card.classList.add('loading');
             showToast('Loading example...', 'info');
             setGradioValue('example-result-data', '');
             setGradioValue('example-idx-input', idx);
             setTimeout(() => {
                 const btn = document.getElementById('example-load-btn');
                 if (btn) {
                     const b = btn.querySelector('button');
                     if (b) b.click(); else btn.click();
                 }
-            }, 150);
-            setTimeout(() => card.classList.remove('loading'), 12000);
         });
     });
-    function checkExampleResult() {
-        if (!exampleResultContainer) return;
-        const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
-        if (!el || !el.value) return;
-        if (window.__lastExampleVal === el.value) return;
-        try {
-            const data = JSON.parse(el.value);
-            if (data.status === 'ok') {
-                window.__lastExampleVal = el.value;
-                if (data.mode) activateModeTab(data.mode);
-                if (data.media) setPreview(data.media, data.name || 'example', data.mode || 'image');
-                if (data.query) {
-                    promptInput.value = data.query;
-                    syncPromptToGradio();
                 }
-                if (data.model) activateModelTab(data.model);
-                document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
-                showToast('Example loaded', 'info');
-            } else if (data.status === 'error') {
-                document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
-                showToast(data.message || 'Failed to load example', 'error');
             }
-        } catch(e) {}
-    }
-    const obsExample = new MutationObserver(checkExampleResult);
-    if (exampleResultContainer) {
-        obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
     }
-    setInterval(checkExampleResult, 500);
     if (outputArea) outputArea.value = '';
     const sb = document.getElementById('sb-run-state');
@@ -1236,15 +1029,8 @@ MODEL_TABS_HTML = "".join([
     for m in MODEL_CHOICES
 ])
-MODE_TABS_HTML = """
-<button class="mode-tab active" data-mode="image">Image Inference</button>
-<button class="mode-tab" data-mode="video">Video Inference</button>
-"""
 with gr.Blocks() as demo:
-    hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
     hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
-    hidden_video_b64 = gr.Textbox(value="", elem_id="hidden-video-b64", elem_classes="hidden-input", container=False)
     prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
     hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
@@ -1268,7 +1054,7 @@ with gr.Blocks() as demo:
                 <div class="app-logo">{VISION_LOGO_SVG}</div>
                 <span class="app-title">VisionScope R2</span>
                 <span class="app-badge">vision enabled</span>
-                <span class="app-badge fast">Image + Video</span>
             </div>
         </div>
@@ -1276,10 +1062,6 @@ with gr.Blocks() as demo:
             {MODEL_TABS_HTML}
         </div>
-        <div class="mode-tabs-bar">
-            {MODE_TABS_HTML}
-        </div>
         <div class="app-main-row">
             <div class="app-main-left">
                 <div id="media-drop-zone">
@@ -1296,7 +1078,6 @@ with gr.Blocks() as demo:
                     <div id="single-preview-wrap" class="single-preview-wrap">
                         <div class="single-preview-card">
                             <img id="single-preview-img" src="" alt="Preview" style="display:none;">
-                            <video id="single-preview-video" controls playsinline style="display:none;"></video>
                             <div class="preview-overlay-actions">
                                 <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
                                 <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
@@ -1306,10 +1087,9 @@ with gr.Blocks() as demo:
                 </div>
                 <div class="hint-bar">
-                    <b>Upload:</b> Click or drag media into the panel &nbsp;&middot;&nbsp;
-                    <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
                     <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
-                    <kbd>Clear</kbd> removes the current media
                 </div>
                 <div class="examples-section">
@@ -1325,7 +1105,7 @@ with gr.Blocks() as demo:
                     <div class="panel-card-title">Vision Instruction</div>
                     <div class="panel-card-body">
                         <label class="modern-label" for="custom-query-input">Query Input</label>
-                        <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., describe the scene, read the handwriting, explain the video, summarize frames, extract visible text, estimate distance..."></textarea>
                     </div>
                 </div>
@@ -1412,11 +1192,9 @@ with gr.Blocks() as demo:
     run_btn.click(
         fn=run_inference,
         inputs=[
-            hidden_mode_name,
             hidden_model_name,
             prompt,
             hidden_image_b64,
-            hidden_video_b64,
             max_new_tokens,
             temperature,
             top_p,
@@ -1425,30 +1203,20 @@ with gr.Blocks() as demo:
             gpu_duration_state,
         ],
         outputs=[result],
-        js=r"""(mode, model, p, img, vid, mnt, t, tp, tk, rp, gd) => {
             const modelEl = document.querySelector('.model-tab.active');
-            const modeEl = document.querySelector('.mode-tab.active');
             const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
-            const modeVal = modeEl ? modeEl.getAttribute('data-mode') : mode;
             const promptEl = document.getElementById('custom-query-input');
             const promptVal = promptEl ? promptEl.value : p;
             let imgVal = img;
-            let vidVal = vid;
             const imgContainer = document.getElementById('hidden-image-b64');
-            const vidContainer = document.getElementById('hidden-video-b64');
             if (imgContainer) {
                 const inner = imgContainer.querySelector('textarea, input');
                 if (inner) imgVal = inner.value;
             }
-            if (vidContainer) {
-                const inner = vidContainer.querySelector('textarea, input');
-                if (inner) vidVal = inner.value;
-            }
-            return [modeVal, modelVal, promptVal, imgVal, vidVal, mnt, t, tp, tk, rp, gd];
         }""",
     )
@@ -1465,5 +1233,5 @@ if __name__ == "__main__":
         mcp_server=True,
         ssr_mode=False,
         show_error=True,
-        allowed_paths=["images", "videos"],
     )

 import os
 import gc
 import json
 import time
 import base64
 from io import BytesIO
 from threading import Thread
 import gradio as gr
 import spaces
 import torch
+from PIL import Image
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 print("Using device:", device)
 MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
 processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 MODEL_CHOICES = list(MODEL_MAP.keys())
 image_examples = [
+    {"query": "type out the messy hand-writing as accurately as you can.", "media": "images/1.jpg", "model": "coreOCR-7B-050325-preview"},
+    {"query": "count the number of birds and explain the scene in detail.", "media": "images/2.jpeg", "model": "DeepCaption-VLA-7B"},
+    {"query": "how far is the Goal from the penalty taker in this image?.", "media": "images/3.png", "model": "SpaceThinker-3B"},
+    {"query": "approximately how many meters apart are the chair and bookshelf?.", "media": "images/4.png", "model": "SkyCaptioner-V1"},
+    {"query": "how far is the man in the red hat from the pallet of boxes in feet?.", "media": "images/5.jpg", "model": "SpaceOm-3B"},
 ]
 def pil_to_data_url(img: Image.Image, fmt="PNG"):
     buf = BytesIO()
         "jpeg": "image/jpeg",
         "png": "image/png",
         "webp": "image/webp",
+    }.get(ext, "image/jpeg")
     with open(path, "rb") as f:
         data = base64.b64encode(f.read()).decode()
     return f"data:{mime};base64,{data}"
+def make_thumb_b64(path, max_dim=240):
     try:
+        img = Image.open(path).convert("RGB")
         img.thumbnail((max_dim, max_dim))
         return pil_to_data_url(img, "JPEG")
     except Exception as e:
 def build_example_cards_html():
     cards = ""
+    for i, ex in enumerate(image_examples):
+        thumb = make_thumb_b64(ex["media"])
         prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
         cards += f"""
         <div class="example-card" data-idx="{i}">
             <div class="example-thumb-wrap">
                 {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
+                <div class="example-media-chip">IMAGE</div>
             </div>
             <div class="example-meta-row">
                 <span class="example-badge">{ex["model"]}</span>
         idx = int(float(idx_str))
     except Exception:
         return json.dumps({"status": "error", "message": "Invalid example index"})
+    if idx < 0 or idx >= len(image_examples):
         return json.dumps({"status": "error", "message": "Example index out of range"})
+    ex = image_examples[idx]
     media_b64 = file_to_data_url(ex["media"])
     if not media_b64:
+        return json.dumps({"status": "error", "message": "Could not load example image"})
     return json.dumps({
         "status": "ok",
         "query": ex["query"],
         "media": media_b64,
         "model": ex["model"],
         "name": os.path.basename(ex["media"]),
     })
         return None
 def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
     try:
         return int(gpu_timeout)
         return 60
 @spaces.GPU(duration=calc_timeout_image)
 def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
     if not model_name or model_name not in MODEL_MAP:
         torch.cuda.empty_cache()
+def run_inference(model_name, text, image_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
+    image = b64_to_pil(image_b64)
+    yield from generate_image(
+        model_name=model_name,
+        text=text,
+        image=image,
+        max_new_tokens=max_new_tokens_v,
+        temperature=temperature_v,
+        top_p=top_p_v,
+        top_k=top_k_v,
+        repetition_penalty=repetition_penalty_v,
+        gpu_timeout=gpu_timeout_v,
+    )
 def noop():
 .model-tab.active{background:rgba(0,0,255,.22);border-color:#0000FF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,0,255,.10)}
 .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
 .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
 .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
 .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
     overflow:hidden;border:1px solid #27272a;background:#111114;
     display:flex;align-items:center;justify-content:center;position:relative;
 }
+.single-preview-card img{
     width:100%;height:100%;max-width:100%;max-height:100%;
     object-fit:contain;display:block;background:#000;
 }
     const fileInput = document.getElementById('custom-file-input');
     const previewWrap = document.getElementById('single-preview-wrap');
     const previewImg = document.getElementById('single-preview-img');
     const btnUpload = document.getElementById('preview-upload-btn');
     const btnClear = document.getElementById('preview-clear-btn');
     const promptInput = document.getElementById('custom-query-input');
     const runBtnEl = document.getElementById('custom-run-btn');
     const outputArea = document.getElementById('custom-output-textarea');
     const mediaStatus = document.getElementById('sb-media-status');
+    if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg) {
         setTimeout(init, 250);
         return;
     }
     window.__visionScopeInitDone = true;
     let mediaState = null;
     let toastTimer = null;
+    let examplePoller = null;
+    let lastSeenExamplePayload = null;
     function showToast(message, type) {
         let toast = document.getElementById('app-toast');
         setTimeout(() => outputArea.classList.remove('error-flash'), 800);
     }
+    function getValueFromContainer(containerId) {
+        const container = document.getElementById(containerId);
+        if (!container) return '';
+        const el = container.querySelector('textarea, input');
+        return el ? (el.value || '') : '';
+    }
     function setGradioValue(containerId, value) {
         const container = document.getElementById(containerId);
         if (!container) return;
         });
     }
+    function syncImageToGradio() {
+        setGradioValue('hidden-image-b64', mediaState ? mediaState.b64 : '');
+        const txt = mediaState ? '1 image uploaded' : 'No image uploaded';
         if (mediaStatus) mediaStatus.textContent = txt;
     }
         setGradioValue('hidden-model-name', name);
     }
     function renderPreview() {
         if (!mediaState) {
             previewImg.src = '';
             previewImg.style.display = 'none';
             previewWrap.style.display = 'none';
             if (uploadPrompt) uploadPrompt.style.display = 'flex';
+            syncImageToGradio();
             return;
         }
+        previewImg.src = mediaState.b64;
+        previewImg.style.display = 'block';
+        previewWrap.style.display = 'flex';
         if (uploadPrompt) uploadPrompt.style.display = 'none';
+        syncImageToGradio();
     }
+    function setPreview(b64, name) {
+        mediaState = {b64, name: name || 'file'};
         renderPreview();
     }
     window.__setPreview = setPreview;
     function processFile(file) {
         if (!file) return;
+        if (!file.type.startsWith('image/')) {
+            showToast('Only image files are supported', 'error');
             return;
         }
         const reader = new FileReader();
+        reader.onload = (e) => setPreview(e.target.result, file.name);
         reader.readAsDataURL(file);
     }
+    if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
+    if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
+    if (btnClear) btnClear.addEventListener('click', clearPreview);
     fileInput.addEventListener('change', (e) => {
         const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
         if (file) processFile(file);
         e.target.value = '';
     });
     dropZone.addEventListener('dragover', (e) => {
         e.preventDefault();
         dropZone.classList.add('drag-over');
     }
     window.__activateModelTab = activateModelTab;
     document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
         btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
     });
     activateModelTab('DeepCaption-VLA-7B');
     function syncSlider(customId, gradioId) {
         const slider = document.getElementById(customId);
     function validateBeforeRun() {
         const promptVal = promptInput.value.trim();
         if (!mediaState && !promptVal) {
+            showToast('Please upload an image and enter your instruction', 'error');
             flashPromptError();
             return false;
         }
         if (!mediaState) {
+            showToast('Please upload an image', 'error');
             return false;
         }
         if (!promptVal) {
     window.__clickGradioRunBtn = function() {
         if (!validateBeforeRun()) return;
         syncPromptToGradio();
+        syncImageToGradio();
         const activeModel = document.querySelector('.model-tab.active');
         if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
         if (outputArea) outputArea.value = '';
         showLoader();
         setTimeout(() => {
         });
     }
+    function applyExamplePayload(raw) {
+        try {
+            const data = JSON.parse(raw);
+            if (data.status === 'ok') {
+                if (data.media) setPreview(data.media, data.name || 'example_file');
+                if (data.query) {
+                    promptInput.value = data.query;
+                    syncPromptToGradio();
+                }
+                if (data.model) activateModelTab(data.model);
+                document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
+                showToast('Example loaded', 'info');
+            } else if (data.status === 'error') {
+                document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
+                showToast(data.message || 'Failed to load example', 'error');
+            }
+        } catch (e) {
+            document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
+            showToast('Failed to parse example data', 'error');
+        }
+    }
+    function startExamplePolling() {
+        if (examplePoller) clearInterval(examplePoller);
+        let attempts = 0;
+        examplePoller = setInterval(() => {
+            attempts += 1;
+            const current = getValueFromContainer('example-result-data');
+            if (current && current !== lastSeenExamplePayload) {
+                lastSeenExamplePayload = current;
+                clearInterval(examplePoller);
+                examplePoller = null;
+                applyExamplePayload(current);
+                return;
+            }
+            if (attempts >= 80) {
+                clearInterval(examplePoller);
+                examplePoller = null;
+                document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
+                showToast('Example load timed out', 'error');
+            }
+        }, 150);
+    }
     document.querySelectorAll('.example-card[data-idx]').forEach(card => {
         card.addEventListener('click', () => {
             const idx = card.getAttribute('data-idx');
             document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
             card.classList.add('loading');
             showToast('Loading example...', 'info');
             setGradioValue('example-result-data', '');
             setGradioValue('example-idx-input', idx);
             setTimeout(() => {
                 const btn = document.getElementById('example-load-btn');
                 if (btn) {
                     const b = btn.querySelector('button');
                     if (b) b.click(); else btn.click();
                 }
+                startExamplePolling();
+            }, 220);
         });
     });
+    const observerTarget = document.getElementById('example-result-data');
+    if (observerTarget) {
+        const obs = new MutationObserver(() => {
+            const current = getValueFromContainer('example-result-data');
+            if (current && current !== lastSeenExamplePayload) {
+                lastSeenExamplePayload = current;
+                if (examplePoller) {
+                    clearInterval(examplePoller);
+                    examplePoller = null;
                 }
+                applyExamplePayload(current);
             }
+        });
+        obs.observe(observerTarget, {childList:true, subtree:true, characterData:true, attributes:true});
     }
     if (outputArea) outputArea.value = '';
     const sb = document.getElementById('sb-run-state');
     for m in MODEL_CHOICES
 ])
 with gr.Blocks() as demo:
     hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
     prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
     hidden_model_name = gr.Textbox(value="DeepCaption-VLA-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
                 <div class="app-logo">{VISION_LOGO_SVG}</div>
                 <span class="app-title">VisionScope R2</span>
                 <span class="app-badge">vision enabled</span>
+                <span class="app-badge fast">Image Inference</span>
             </div>
         </div>
             {MODEL_TABS_HTML}
         </div>
         <div class="app-main-row">
             <div class="app-main-left">
                 <div id="media-drop-zone">
                     <div id="single-preview-wrap" class="single-preview-wrap">
                         <div class="single-preview-card">
                             <img id="single-preview-img" src="" alt="Preview" style="display:none;">
                             <div class="preview-overlay-actions">
                                 <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
                                 <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
                 </div>
                 <div class="hint-bar">
+                    <b>Upload:</b> Click or drag an image into the panel &nbsp;&middot;&nbsp;
                     <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
+                    <kbd>Clear</kbd> removes the current image
                 </div>
                 <div class="examples-section">
                     <div class="panel-card-title">Vision Instruction</div>
                     <div class="panel-card-body">
                         <label class="modern-label" for="custom-query-input">Query Input</label>
+                        <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., describe the scene, read the handwriting, extract visible text, estimate distance..."></textarea>
                     </div>
                 </div>
     run_btn.click(
         fn=run_inference,
         inputs=[
             hidden_model_name,
             prompt,
             hidden_image_b64,
             max_new_tokens,
             temperature,
             top_p,
             gpu_duration_state,
         ],
         outputs=[result],
+        js=r"""(model, p, img, mnt, t, tp, tk, rp, gd) => {
             const modelEl = document.querySelector('.model-tab.active');
             const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
             const promptEl = document.getElementById('custom-query-input');
             const promptVal = promptEl ? promptEl.value : p;
             let imgVal = img;
             const imgContainer = document.getElementById('hidden-image-b64');
             if (imgContainer) {
                 const inner = imgContainer.querySelector('textarea, input');
                 if (inner) imgVal = inner.value;
             }
+            return [modelVal, promptVal, imgVal, mnt, t, tp, tk, rp, gd];
         }""",
     )
         mcp_server=True,
         ssr_mode=False,
         show_error=True,
+        allowed_paths=["images"],
     )