| import gradio as gr |
| import time |
| from apscheduler.schedulers.background import BackgroundScheduler |
| import threading |
| import globals |
| from utils.io import save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats |
| from utils.jobs import run_single_job, run_multiple_jobs, launch_jobs, update_job_statuses, relaunch_failed_jobs |
| from typing import List, Optional |
|
|
|
|
def status_monitor(interval_seconds: float = 240) -> None:
    """Poll job statuses forever; meant to run in a background thread.

    Each cycle calls ``update_job_statuses()`` and then sleeps. A failing
    poll is reported and retried on the next cycle instead of ending the
    loop: an uncaught exception here would silently terminate the thread
    and statuses would never be refreshed again.

    Args:
        interval_seconds: Pause between two polls, in seconds. Defaults to
            240, the interval that used to be hard-coded.
    """
    while True:
        try:
            update_job_statuses()
        except Exception as exc:  # thread boundary: report and keep polling
            print(f"[Monitor] Failed to update job statuses: {exc}")
        time.sleep(interval_seconds)
|
|
|
|
def daily_checkpoint() -> None:
    """Announce the checkpoint on stdout, then persist results via ``save_results()``."""
    print("Daily checkpoint - saving current state")
    save_results()
|
|
|
|
| |
def create_app() -> gr.Blocks:
    """Build the dashboard UI and wire up its event handlers.

    The "Main" tab holds the single-job launcher, the bulk action buttons,
    a status box, summary statistics and the results table; the "About" tab
    holds static explanatory text. The app is only assembled here, not
    launched.

    Returns:
        The assembled ``gr.Blocks`` application.
    """
    # Index of the last table column. Every column before it is static, and
    # a click on it relaunches the jobs of the clicked row.
    relaunch_col = 11

    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

            # Single model/provider launcher.
            with gr.Row():
                with gr.Column(scale=2):
                    model_input = gr.Textbox(
                        label="Model",
                        placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct",
                        info="Enter HuggingFace model ID"
                    )
                with gr.Column(scale=1):
                    provider_input = gr.Textbox(
                        label="Provider",
                        placeholder="e.g., together-ai",
                        info="Enter inference provider name"
                    )
                with gr.Column(scale=1):
                    launch_single_btn = gr.Button("Launch Job", variant="primary")

            # Bulk actions.
            with gr.Row():
                launch_btn = gr.Button("Launch All Jobs", variant="secondary", scale=2)
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop", scale=1)
                refresh_btn = gr.Button("🔄 Refresh", variant="secondary", scale=1)

            output = gr.Textbox(label="Status", interactive=False)

            summary_stats = gr.Markdown(value=get_summary_stats())

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
                        static_columns=list(range(relaunch_col)),
                        # Ten text columns, one HTML column, then the relaunch column.
                        datatype=["str"] * 10 + ["html", "str"],
                        elem_id="results_table"
                    )

            def refresh_display():
                """Return the current results table and summary stats."""
                return get_results_table(), get_summary_stats()

            def launch_single_and_update(model: str, provider: str):
                """Launch the jobs for one model/provider pair.

                Surrounding whitespace is stripped first, so blank-looking
                input is rejected rather than launched.

                Returns:
                    A (status message, results table, summary stats) tuple.
                """
                model = (model or "").strip()
                provider = (provider or "").strip()
                if not model or not provider:
                    return ("❌ Please provide both model and provider", *refresh_display())

                job_ids = run_multiple_jobs(model, provider, globals.TASKS)
                if not job_ids:
                    return ("❌ Failed to launch jobs (may already be running)", *refresh_display())

                save_results()
                return (f"✅ Launched {len(job_ids)} jobs for {model} on {provider}", *refresh_display())

            launch_single_btn.click(
                fn=launch_single_and_update,
                inputs=[model_input, provider_input],
                outputs=[output, results_table, summary_stats]
            )

            def launch_and_update():
                """Run ``launch_jobs()`` and return its result with the refreshed table and stats."""
                result = launch_jobs()
                return (result, *refresh_display())

            def relaunch_and_update():
                """Run ``relaunch_failed_jobs()`` and return its result with the refreshed table and stats."""
                result = relaunch_failed_jobs()
                return (result, *refresh_display())

            launch_btn.click(
                fn=launch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            relaunch_failed_btn.click(
                fn=relaunch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            refresh_btn.click(
                fn=refresh_display,
                outputs=[results_table, summary_stats]
            )

            def handle_table_select(evt: gr.SelectData):
                """Relaunch a row's jobs when its last-column cell is clicked.

                Clicks on any other column only refresh the display.

                Returns:
                    A (results table, summary stats) tuple.
                """
                row_idx, col_idx = evt.index[0], evt.index[1]
                print(f"[Relaunch] Cell selected - Row: {row_idx}, Col: {col_idx}, Value: {evt.value}")

                if col_idx == relaunch_col:
                    # NOTE(review): the row is looked up by position in a freshly
                    # loaded table; this presumes the displayed row order matches
                    # it (no client-side sort/search in effect) - confirm.
                    # NOTE(review): `.data` suggests get_results_table() returns a
                    # pandas Styler wrapping the DataFrame - confirm.
                    df = get_results_table()
                    try:
                        row_data = df.data.iloc[row_idx]
                    except IndexError:
                        # The clicked row is not in the fresh table; just refresh.
                        print(f"[Relaunch] Row {row_idx} not found in results table; nothing relaunched")
                    else:
                        model = row_data['Model']
                        provider = row_data['Provider']
                        print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")

                        run_multiple_jobs(model, provider, globals.TASKS)
                        save_results()

                return refresh_display()

            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

            def auto_refresh():
                """Auto-refresh table and summary stats on each timer tick."""
                return refresh_display()

            # Refresh the display every 30 seconds.
            timer = gr.Timer(value=30, active=True)
            timer.tick(
                fn=auto_refresh,
                inputs=[],
                outputs=[results_table, summary_stats]
            )
        with gr.Tab("About"):
            gr.Markdown("""
In this demo, we run 10 samples of 3 evaluations: ifeval (instruction following), gsm_plus (grade school math problems, less contaminated than gsm8k) and gpqa, diamond subset (knowledge), with `lighteval`, `inference-providers` and `jobs`.

The "status" column indicates whether the evaluation failed completely (usually because of the provider was down or because we were rate limited).

To run any of these locally, you can use the following
```python
from huggingface_hub import run_job, inspect_job, whoami
job = run_job(
    image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
    command=[
        "lighteval", "endpoint", "inference-providers",
        "model_name=MODEL,provider=PROVIDER",
        "extended|ifeval|0,lighteval|gpqa:diamond|0",
        "--push-to-hub", "--save-details",
        "--results-org", "YOURORG"
    ],
    namespace="huggingface",
    secrets={"HF_TOKEN": YOURTOKEN},
    token=YOURTOKEN
)
```
""")

    return demo
|
|
|
|
if __name__ == "__main__":
    # Load results first, before the monitor thread and the UI are started.
    load_results()
    print("Starting Inference Provider Testing Dashboard")

    # Poll job statuses in a daemon thread so it never blocks interpreter exit.
    monitor_thread = threading.Thread(target=status_monitor, daemon=True)
    monitor_thread.start()
    print("Job status monitor started")

    # Cron trigger: run daily_checkpoint every day at 00:00.
    scheduler = BackgroundScheduler()
    scheduler.add_job(daily_checkpoint, trigger='cron', hour=0, minute=0)
    scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    # Build the UI and serve it on all interfaces, port 7860.
    demo = create_app()
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|