Instructions to use DeepXR/Helion-V2.5-Rnd with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use DeepXR/Helion-V2.5-Rnd with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use DeepXR/Helion-V2.5-Rnd with vLLM:

Install from pip and serve the model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "DeepXR/Helion-V2.5-Rnd"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/DeepXR/Helion-V2.5-Rnd

SGLang

How to use DeepXR/Helion-V2.5-Rnd with SGLang:

Install from pip and serve the model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V2.5-Rnd" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "DeepXR/Helion-V2.5-Rnd" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "DeepXR/Helion-V2.5-Rnd",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use DeepXR/Helion-V2.5-Rnd with Docker Model Runner:
```
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
```

Helion-V2.5-Rnd / inference / benchmark.py

Trouter-Library

Create inference/benchmark.py

0574c09 verified 6 months ago

raw

history blame

15.1 kB

	#!/usr/bin/env python3
	"""
	Helion-2.5-Rnd Benchmark Runner
	Comprehensive benchmarking suite for performance testing
	"""

	import argparse
	import json
	import logging
	import statistics
	import time
	from collections import defaultdict
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from datetime import datetime
	from pathlib import Path
	from typing import Dict, List, Optional

	import numpy as np

	from inference.client import HelionClient

	# Configure the root logger at INFO level when this module is loaded.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger, named after this module, used by the benchmark code below.
	logger = logging.getLogger(__name__)


	class BenchmarkRunner:
	    """Run comprehensive benchmarks on Helion model

	    Every request is issued through ``self.client.complete(...)``. Results
	    are collected in ``self.results['tests']`` and can be written to disk
	    with ``save_results`` or logged with ``print_summary``.

	    NOTE(review): the benchmarks treat the value returned by
	    ``client.complete`` as a ``str`` (it is split, sliced and measured with
	    ``len``) -- confirm against ``HelionClient``.
	    """

	    def __init__(
	        self,
	        base_url: str = "http://localhost:8000",
	        output_dir: str = "./benchmark_results"
	    ):
	        """
	        Initialize benchmark runner

	        Args:
	            base_url: API base URL
	            output_dir: Directory for results (created if it does not exist)
	        """
	        self.client = HelionClient(base_url=base_url)
	        self.output_dir = Path(output_dir)
	        self.output_dir.mkdir(parents=True, exist_ok=True)

	        self.results = {
	            'timestamp': datetime.now().isoformat(),
	            'base_url': base_url,
	            'tests': {}
	        }

	    @staticmethod
	    def _truncate(text: str, limit: int) -> str:
	        """Cut ``text`` to ``limit`` characters, adding "..." only if it was cut."""
	        if len(text) <= limit:
	            return text
	        return text[:limit] + "..."

	    def benchmark_latency(
	        self,
	        num_requests: int = 100,
	        prompt_lengths: Optional[List[int]] = None,
	        max_tokens: int = 256
	    ) -> Dict:
	        """
	        Benchmark inference latency

	        Requests are sent one at a time; failed requests are logged and
	        excluded from the statistics.

	        Args:
	            num_requests: Number of requests per test
	            prompt_lengths: Approximate prompt lengths to test, in
	                characters (defaults to ``[128, 512, 2048]``)
	            max_tokens: Maximum tokens to generate

	        Returns:
	            Mapping of ``"prompt_<length>"`` to latency statistics in
	            milliseconds. A length with no successful request is omitted.
	        """
	        # None instead of a list literal avoids a shared mutable default.
	        if prompt_lengths is None:
	            prompt_lengths = [128, 512, 2048]

	        logger.info("Running latency benchmark...")

	        unit = "Hello world. "
	        results = {}

	        for prompt_len in prompt_lengths:
	            logger.info(f"Testing prompt length: {prompt_len}")

	            # Whole repetitions only, so the prompt is at most prompt_len chars.
	            test_prompt = unit * (prompt_len // len(unit))

	            latencies = []

	            for i in range(num_requests):
	                try:
	                    # perf_counter is monotonic, unlike wall-clock time.time().
	                    started = time.perf_counter()

	                    self.client.complete(
	                        prompt=test_prompt,
	                        max_tokens=max_tokens,
	                        temperature=0.7,
	                        stream=False
	                    )

	                    elapsed_ms = (time.perf_counter() - started) * 1000
	                    latencies.append(elapsed_ms)

	                    if i % 10 == 0:
	                        logger.info(f" Progress: {i+1}/{num_requests}")

	                except Exception as e:
	                    logger.error(f"Request failed: {e}")

	            if latencies:
	                # Plain floats keep the results independent of NumPy scalar types.
	                p50, p90, p95, p99 = (
	                    float(value)
	                    for value in np.percentile(latencies, [50, 90, 95, 99])
	                )
	                results[f"prompt_{prompt_len}"] = {
	                    'num_samples': len(latencies),
	                    'mean_ms': statistics.mean(latencies),
	                    'median_ms': statistics.median(latencies),
	                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0,
	                    'min_ms': min(latencies),
	                    'max_ms': max(latencies),
	                    'p50_ms': p50,
	                    'p90_ms': p90,
	                    'p95_ms': p95,
	                    'p99_ms': p99
	                }

	        return results

	    def benchmark_throughput(
	        self,
	        duration_seconds: int = 60,
	        concurrent_requests: int = 10,
	        prompt_length: int = 512,
	        max_tokens: int = 128
	    ) -> Dict:
	        """
	        Benchmark throughput with concurrent requests

	        Requests are submitted in batches of ``concurrent_requests``; a new
	        batch starts only after the previous one has finished and while the
	        deadline has not passed, so the run may overshoot
	        ``duration_seconds`` by up to one batch.

	        Args:
	            duration_seconds: How long to run test
	            concurrent_requests: Number of concurrent requests
	            prompt_length: Approximate prompt length in characters
	            max_tokens: Maximum tokens to generate

	        Returns:
	            Throughput benchmark results. Token counts are approximated by
	            whitespace-splitting the response.
	        """
	        logger.info(f"Running throughput benchmark for {duration_seconds}s...")

	        unit = "The quick brown fox jumps over the lazy dog. "
	        test_prompt = unit * (prompt_length // len(unit))

	        start_time = time.perf_counter()
	        deadline = start_time + duration_seconds

	        completed_requests = 0
	        failed_requests = 0
	        total_tokens = 0
	        latencies = []
	        last_error = None

	        def make_request():
	            # Runs in a worker thread; returns a result dict instead of
	            # touching the counters so aggregation stays in this thread.
	            try:
	                req_start = time.perf_counter()
	                response = self.client.complete(
	                    prompt=test_prompt,
	                    max_tokens=max_tokens,
	                    temperature=0.7
	                )
	                req_end = time.perf_counter()

	                return {
	                    'success': True,
	                    'latency': req_end - req_start,
	                    'tokens': len(response.split())  # Approximate
	                }
	            except Exception as e:
	                return {'success': False, 'error': str(e)}

	        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
	            while time.perf_counter() < deadline:
	                futures = [executor.submit(make_request) for _ in range(concurrent_requests)]

	                for future in as_completed(futures):
	                    result = future.result()

	                    if result['success']:
	                        completed_requests += 1
	                        latencies.append(result['latency'] * 1000)
	                        total_tokens += result.get('tokens', 0)
	                    else:
	                        failed_requests += 1
	                        last_error = result.get('error')

	        actual_duration = time.perf_counter() - start_time

	        # One summary line rather than one log line per failure: a dead
	        # server would otherwise flood the log for the whole run.
	        if failed_requests:
	            logger.warning(
	                f"{failed_requests} throughput requests failed; last error: {last_error}"
	            )

	        # A zero-length run (e.g. duration_seconds=0) must not divide by zero.
	        if actual_duration > 0:
	            requests_per_second = completed_requests / actual_duration
	            tokens_per_second = total_tokens / actual_duration
	        else:
	            requests_per_second = 0.0
	            tokens_per_second = 0.0

	        return {
	            'duration_seconds': actual_duration,
	            'concurrent_requests': concurrent_requests,
	            'completed_requests': completed_requests,
	            'failed_requests': failed_requests,
	            'requests_per_second': requests_per_second,
	            'total_tokens': total_tokens,
	            'tokens_per_second': tokens_per_second,
	            'avg_latency_ms': statistics.mean(latencies) if latencies else 0,
	            'p95_latency_ms': float(np.percentile(latencies, 95)) if latencies else 0
	        }

	    def benchmark_context_length(
	        self,
	        context_lengths: Optional[List[int]] = None,
	        num_samples: int = 10
	    ) -> Dict:
	        """
	        Benchmark performance across different context lengths

	        Args:
	            context_lengths: Context lengths to test, measured in characters
	                of filler text (defaults to ``[1024, 4096, 16384, 65536]``)
	            num_samples: Number of samples per length

	        Returns:
	            Mapping of ``"context_<length>"`` to latency statistics in
	            milliseconds. A length with no successful request is omitted.
	        """
	        # None instead of a list literal avoids a shared mutable default.
	        if context_lengths is None:
	            context_lengths = [1024, 4096, 16384, 65536]

	        logger.info("Running context length benchmark...")

	        base_text = "This is a test sentence for context length benchmarking. "
	        results = {}

	        for ctx_len in context_lengths:
	            logger.info(f"Testing context length: {ctx_len}")

	            # Repeat one time more than fits, then trim, so the filler is
	            # exactly ctx_len characters instead of falling short of it.
	            repeats = ctx_len // len(base_text) + 1
	            filler = (base_text * repeats)[:ctx_len]
	            long_prompt = filler + "\n\nSummarize the above text:"

	            latencies = []

	            for _ in range(num_samples):
	                try:
	                    started = time.perf_counter()

	                    self.client.complete(
	                        prompt=long_prompt,
	                        max_tokens=256,
	                        temperature=0.5
	                    )

	                    latencies.append((time.perf_counter() - started) * 1000)

	                except Exception as e:
	                    logger.error(f"Context length {ctx_len} failed: {e}")

	            if latencies:
	                results[f"context_{ctx_len}"] = {
	                    'mean_latency_ms': statistics.mean(latencies),
	                    'median_latency_ms': statistics.median(latencies),
	                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0
	                }

	        return results

	    def benchmark_generation_quality(
	        self,
	        test_prompts: Optional[List[str]] = None,
	        num_samples: int = 5
	    ) -> Dict:
	        """
	        Benchmark generation quality with diverse prompts

	        No scoring is performed: for each prompt the method records how many
	        generations succeeded, their average length and one sample.

	        Args:
	            test_prompts: Custom test prompts (a built-in set is used if None)
	            num_samples: Number of samples per prompt type

	        Returns:
	            Mapping of ``"prompt_<n>"`` (1-based) to a summary dict. A prompt
	            with no successful generation is omitted.
	        """
	        logger.info("Running generation quality benchmark...")

	        if test_prompts is None:
	            test_prompts = [
	                "Explain quantum computing in simple terms:",
	                "Write a Python function to calculate fibonacci numbers:",
	                "Translate 'Hello, how are you?' to Spanish, French, and German:",
	                "Solve: If x + 5 = 12, what is x?",
	                "Write a haiku about artificial intelligence:"
	            ]

	        results = {}

	        for i, prompt in enumerate(test_prompts):
	            logger.info(f"Testing prompt {i+1}/{len(test_prompts)}")

	            responses = []

	            for _ in range(num_samples):
	                try:
	                    response = self.client.complete(
	                        prompt=prompt,
	                        max_tokens=512,
	                        temperature=0.7
	                    )
	                    responses.append(response)
	                except Exception as e:
	                    logger.error(f"Generation failed: {e}")

	            if responses:
	                results[f"prompt_{i+1}"] = {
	                    # Ellipsis only when text was really cut off.
	                    'prompt': self._truncate(prompt, 50),
	                    'num_responses': len(responses),
	                    'avg_length': statistics.mean(len(r) for r in responses),
	                    'sample_response': self._truncate(responses[0], 200)
	                }

	        return results

	    def run_all_benchmarks(self, quick_mode: bool = False) -> Dict:
	        """
	        Run all benchmark suites

	        Each suite's result is stored under ``self.results['tests']`` with
	        the keys ``latency``, ``throughput``, ``context_length`` and
	        ``generation_quality``.

	        Args:
	            quick_mode: Run faster with fewer samples

	        Returns:
	            Complete benchmark results (the ``self.results`` dict)
	        """
	        logger.info("Starting comprehensive benchmark suite...")

	        if quick_mode:
	            logger.info("Running in quick mode (fewer samples)")

	        tests = self.results['tests']

	        # Latency benchmark
	        logger.info("\n=== Latency Benchmark ===")
	        tests['latency'] = self.benchmark_latency(
	            num_requests=20 if quick_mode else 100,
	            prompt_lengths=[128, 512] if quick_mode else [128, 512, 2048]
	        )

	        # Throughput benchmark
	        logger.info("\n=== Throughput Benchmark ===")
	        tests['throughput'] = self.benchmark_throughput(
	            duration_seconds=30 if quick_mode else 60,
	            concurrent_requests=5 if quick_mode else 10
	        )

	        # Context length benchmark
	        logger.info("\n=== Context Length Benchmark ===")
	        tests['context_length'] = self.benchmark_context_length(
	            context_lengths=[1024, 4096] if quick_mode else [1024, 4096, 16384],
	            num_samples=5 if quick_mode else 10
	        )

	        # Generation quality
	        logger.info("\n=== Generation Quality Benchmark ===")
	        tests['generation_quality'] = self.benchmark_generation_quality(
	            num_samples=2 if quick_mode else 5
	        )

	        return self.results

	    def save_results(self, filename: Optional[str] = None):
	        """
	        Save benchmark results to file

	        Args:
	            filename: Output filename inside ``output_dir``; defaults to
	                ``benchmark_<YYYYmmdd_HHMMSS>.json``
	        """
	        if filename is None:
	            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	            filename = f"benchmark_{timestamp}.json"

	        output_path = self.output_dir / filename

	        # Explicit encoding so the file does not depend on the platform default.
	        with open(output_path, 'w', encoding='utf-8') as f:
	            json.dump(self.results, f, indent=2)

	        logger.info(f"Results saved to {output_path}")

	    def print_summary(self):
	        """Print benchmark summary (latency and throughput only) via the logger"""
	        tests = self.results['tests']

	        logger.info("\n" + "="*60)
	        logger.info("BENCHMARK SUMMARY")
	        logger.info("="*60)

	        if 'latency' in tests:
	            logger.info("\nLatency Results:")
	            for prompt_type, metrics in tests['latency'].items():
	                logger.info(f" {prompt_type}:")
	                logger.info(f" Mean: {metrics['mean_ms']:.2f}ms")
	                logger.info(f" P95: {metrics['p95_ms']:.2f}ms")
	                logger.info(f" P99: {metrics['p99_ms']:.2f}ms")

	        if 'throughput' in tests:
	            logger.info("\nThroughput Results:")
	            metrics = tests['throughput']
	            logger.info(f" Requests/sec: {metrics['requests_per_second']:.2f}")
	            logger.info(f" Tokens/sec: {metrics['tokens_per_second']:.2f}")
	            logger.info(f" Avg Latency: {metrics['avg_latency_ms']:.2f}ms")

	        logger.info("\n" + "="*60)


	def main():
	    """Main entry point

	    Parses the command line, runs either the full suite or the single
	    benchmark named by ``--test``, then saves and summarizes the results.
	    """
	    parser = argparse.ArgumentParser(description="Helion Benchmark Runner")
	    parser.add_argument("--base-url", type=str, default="http://localhost:8000")
	    parser.add_argument("--output-dir", type=str, default="./benchmark_results")
	    parser.add_argument("--quick", action="store_true", help="Run quick benchmark")
	    parser.add_argument("--test", type=str, choices=['latency', 'throughput', 'context', 'quality', 'all'],
	                        default='all', help="Specific test to run")

	    args = parser.parse_args()

	    runner = BenchmarkRunner(
	        base_url=args.base_url,
	        output_dir=args.output_dir
	    )

	    quick = args.quick

	    if args.test == 'all':
	        # run_all_benchmarks stores every result in runner.results itself.
	        runner.run_all_benchmarks(quick_mode=quick)
	    else:
	        # --test choice -> (key used in runner.results['tests'], benchmark call).
	        # The keys match the ones run_all_benchmarks writes, so saved files
	        # and print_summary look the same for single and full runs.
	        single_tests = {
	            'latency': (
	                'latency',
	                lambda: runner.benchmark_latency(num_requests=20 if quick else 100),
	            ),
	            'throughput': (
	                'throughput',
	                lambda: runner.benchmark_throughput(duration_seconds=30 if quick else 60),
	            ),
	            'context': (
	                'context_length',
	                lambda: runner.benchmark_context_length(num_samples=5 if quick else 10),
	            ),
	            'quality': (
	                'generation_quality',
	                lambda: runner.benchmark_generation_quality(num_samples=2 if quick else 5),
	            ),
	        }
	        result_key, run_test = single_tests[args.test]
	        # Record the result; otherwise save_results() would write an empty
	        # 'tests' dict and print_summary() would have nothing to report.
	        runner.results['tests'][result_key] = run_test()

	    runner.save_results()
	    runner.print_summary()


	if __name__ == "__main__":
	    main()