Text Generation
Transformers
Safetensors
llama
research
code
mathematics
reasoning
multilingual
long-context
custom_code
text-generation-inference
Instructions to use DeepXR/Helion-V2.5-Rnd with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DeepXR/Helion-V2.5-Rnd with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DeepXR/Helion-V2.5-Rnd with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DeepXR/Helion-V2.5-Rnd"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
- SGLang
How to use DeepXR/Helion-V2.5-Rnd with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "DeepXR/Helion-V2.5-Rnd" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V2.5-Rnd" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use DeepXR/Helion-V2.5-Rnd with Docker Model Runner:
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
| #!/usr/bin/env python3 | |
| """ | |
| Helion-2.5-Rnd Benchmark Runner | |
| Comprehensive benchmarking suite for performance testing | |
| """ | |
| import argparse | |
| import json | |
| import logging | |
| import statistics | |
| import time | |
| from collections import defaultdict | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
| import numpy as np | |
| from inference.client import HelionClient | |
| # Configure logging at import time so INFO-level progress messages are emitted. | |
| logging.basicConfig(level=logging.INFO) | |
| # Module-level logger used by BenchmarkRunner for progress, error and summary output. | |
| logger = logging.getLogger(__name__) | |
class BenchmarkRunner:
    """Run latency, throughput, context-length and generation benchmarks.

    Every benchmark issues requests through ``self.client.complete(...)`` and
    returns a plain dict of metrics. ``run_all_benchmarks`` collects those
    dicts under ``self.results['tests']``; ``save_results`` and
    ``print_summary`` read from ``self.results``.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        output_dir: str = "./benchmark_results"
    ):
        """
        Initialize benchmark runner

        Args:
            base_url: API base URL, passed to ``HelionClient``
            output_dir: Directory for results (created if it does not exist)
        """
        self.client = HelionClient(base_url=base_url)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.results = {
            'timestamp': datetime.now().isoformat(),
            'base_url': base_url,
            'tests': {}
        }

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Return *text* cut to *limit* characters, adding "..." only if it was cut."""
        if len(text) <= limit:
            return text
        return text[:limit] + "..."

    def benchmark_latency(
        self,
        num_requests: int = 100,
        prompt_lengths: Optional[List[int]] = None,
        max_tokens: int = 256
    ) -> Dict:
        """
        Benchmark inference latency with sequential, non-streaming requests

        Args:
            num_requests: Number of requests per test
            prompt_lengths: Approximate prompt lengths in characters to test;
                defaults to [128, 512, 2048]
            max_tokens: Maximum tokens to generate

        Returns:
            Dict keyed by ``prompt_<length>`` with latency statistics in
            milliseconds. Lengths for which every request failed are omitted.
        """
        if prompt_lengths is None:
            prompt_lengths = [128, 512, 2048]

        logger.info("Running latency benchmark...")
        results = {}
        prompt_unit = "Hello world. "

        for prompt_len in prompt_lengths:
            logger.info(f"Testing prompt length: {prompt_len}")

            # Repeat the unit to roughly reach the requested length; use at
            # least one repetition so tiny lengths do not send an empty prompt.
            test_prompt = prompt_unit * max(1, prompt_len // len(prompt_unit))

            latencies = []
            failures = 0

            for i in range(num_requests):
                try:
                    # perf_counter is monotonic, so the measured interval is
                    # not affected by system clock adjustments.
                    started = time.perf_counter()
                    self.client.complete(
                        prompt=test_prompt,
                        max_tokens=max_tokens,
                        temperature=0.7,
                        stream=False
                    )
                    latencies.append((time.perf_counter() - started) * 1000)  # ms
                except Exception as e:
                    failures += 1
                    logger.error(f"Request failed: {e}")

                if i % 10 == 0:
                    logger.info(f"  Progress: {i+1}/{num_requests}")

            if latencies:
                results[f"prompt_{prompt_len}"] = {
                    'num_samples': len(latencies),
                    'num_failed': failures,
                    'mean_ms': statistics.mean(latencies),
                    'median_ms': statistics.median(latencies),
                    # stdev needs at least two samples
                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0,
                    'min_ms': min(latencies),
                    'max_ms': max(latencies),
                    # float() keeps the stored values plain Python floats
                    'p50_ms': float(np.percentile(latencies, 50)),
                    'p90_ms': float(np.percentile(latencies, 90)),
                    'p95_ms': float(np.percentile(latencies, 95)),
                    'p99_ms': float(np.percentile(latencies, 99))
                }

        return results

    def benchmark_throughput(
        self,
        duration_seconds: int = 60,
        concurrent_requests: int = 10,
        prompt_length: int = 512,
        max_tokens: int = 128
    ) -> Dict:
        """
        Benchmark throughput with concurrent requests

        Requests are submitted in batches of ``concurrent_requests``; a new
        batch starts only after the previous one has finished, so the run can
        overshoot ``duration_seconds`` by up to one batch.

        Args:
            duration_seconds: How long to run test
            concurrent_requests: Number of concurrent requests
            prompt_length: Approximate prompt length in characters
            max_tokens: Maximum tokens to generate

        Returns:
            Throughput benchmark results (rates are 0 when no time elapsed)
        """
        logger.info(f"Running throughput benchmark for {duration_seconds}s...")

        test_prompt = "The quick brown fox jumps over the lazy dog. " * (prompt_length // 45)

        completed_requests = 0
        failed_requests = 0
        total_tokens = 0
        latencies = []

        def make_request():
            try:
                req_start = time.perf_counter()
                response = self.client.complete(
                    prompt=test_prompt,
                    max_tokens=max_tokens,
                    temperature=0.7
                )
                req_end = time.perf_counter()
                return {
                    'success': True,
                    'latency': req_end - req_start,
                    # Approximate: whitespace-separated words, not model tokens.
                    # Assumes complete() returns a str; anything else is
                    # reported as a failed request by the handler below.
                    'tokens': len(response.split())
                }
            except Exception as e:
                return {'success': False, 'error': str(e)}

        start_time = time.perf_counter()
        deadline = start_time + duration_seconds

        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            while time.perf_counter() < deadline:
                futures = [executor.submit(make_request) for _ in range(concurrent_requests)]

                for future in as_completed(futures):
                    result = future.result()
                    if result['success']:
                        completed_requests += 1
                        latencies.append(result['latency'] * 1000)
                        total_tokens += result.get('tokens', 0)
                    else:
                        # Log only the first failure: when the server is down
                        # this loop spins quickly and would flood the log.
                        if failed_requests == 0:
                            logger.error(f"Request failed: {result['error']}")
                        failed_requests += 1

        actual_duration = time.perf_counter() - start_time
        # Guard the rates against a zero-length run (e.g. duration_seconds=0).
        has_elapsed = actual_duration > 0

        return {
            'duration_seconds': actual_duration,
            'concurrent_requests': concurrent_requests,
            'completed_requests': completed_requests,
            'failed_requests': failed_requests,
            'requests_per_second': completed_requests / actual_duration if has_elapsed else 0,
            'total_tokens': total_tokens,
            'tokens_per_second': total_tokens / actual_duration if has_elapsed else 0,
            'avg_latency_ms': statistics.mean(latencies) if latencies else 0,
            'p95_latency_ms': float(np.percentile(latencies, 95)) if latencies else 0
        }

    def benchmark_context_length(
        self,
        context_lengths: Optional[List[int]] = None,
        num_samples: int = 10
    ) -> Dict:
        """
        Benchmark performance across different context lengths

        Args:
            context_lengths: Context lengths to test, measured in characters
                of filler text (not tokens); defaults to
                [1024, 4096, 16384, 65536]
            num_samples: Number of samples per length

        Returns:
            Dict keyed by ``context_<length>`` with latency statistics in
            milliseconds. Lengths for which every request failed are omitted.
        """
        if context_lengths is None:
            context_lengths = [1024, 4096, 16384, 65536]

        logger.info("Running context length benchmark...")
        results = {}
        base_text = "This is a test sentence for context length benchmarking. "

        for ctx_len in context_lengths:
            logger.info(f"Testing context length: {ctx_len}")

            # Round the repeat count UP so the filler is at least ctx_len
            # characters, then cut to exactly ctx_len. Rounding down would
            # leave the prompt short and make the slice a no-op.
            repeats = -(-ctx_len // len(base_text))
            long_prompt = (base_text * repeats)[:ctx_len] + "\n\nSummarize the above text:"

            latencies = []

            for _ in range(num_samples):
                try:
                    started = time.perf_counter()
                    self.client.complete(
                        prompt=long_prompt,
                        max_tokens=256,
                        temperature=0.5
                    )
                    latencies.append((time.perf_counter() - started) * 1000)
                except Exception as e:
                    logger.error(f"Context length {ctx_len} failed: {e}")

            if latencies:
                results[f"context_{ctx_len}"] = {
                    'mean_latency_ms': statistics.mean(latencies),
                    'median_latency_ms': statistics.median(latencies),
                    'std_dev_ms': statistics.stdev(latencies) if len(latencies) > 1 else 0
                }

        return results

    def benchmark_generation_quality(
        self,
        test_prompts: Optional[List[str]] = None,
        num_samples: int = 5
    ) -> Dict:
        """
        Collect sample generations for a set of diverse prompts

        No quality score is computed; the result records how many responses
        were obtained, their average character length and one sample.

        Args:
            test_prompts: Custom test prompts; a built-in set is used if None
            num_samples: Number of samples per prompt

        Returns:
            Dict keyed by ``prompt_<n>`` (1-based). Prompts for which every
            request failed are omitted.
        """
        logger.info("Running generation quality benchmark...")

        if test_prompts is None:
            test_prompts = [
                "Explain quantum computing in simple terms:",
                "Write a Python function to calculate fibonacci numbers:",
                "Translate 'Hello, how are you?' to Spanish, French, and German:",
                "Solve: If x + 5 = 12, what is x?",
                "Write a haiku about artificial intelligence:"
            ]

        results = {}

        for i, prompt in enumerate(test_prompts):
            logger.info(f"Testing prompt {i+1}/{len(test_prompts)}")

            responses = []

            for _ in range(num_samples):
                try:
                    response = self.client.complete(
                        prompt=prompt,
                        max_tokens=512,
                        temperature=0.7
                    )
                    responses.append(response)
                except Exception as e:
                    logger.error(f"Generation failed: {e}")

            if responses:
                results[f"prompt_{i+1}"] = {
                    # Ellipsis is added only when text was actually cut.
                    'prompt': self._truncate(prompt, 50),
                    'num_responses': len(responses),
                    'avg_length': statistics.mean([len(r) for r in responses]),
                    'sample_response': self._truncate(responses[0], 200)
                }

        return results

    def run_all_benchmarks(self, quick_mode: bool = False) -> Dict:
        """
        Run all benchmark suites and store them in ``self.results['tests']``

        Args:
            quick_mode: Run faster with fewer samples

        Returns:
            Complete benchmark results (the same object as ``self.results``)
        """
        logger.info("Starting comprehensive benchmark suite...")

        if quick_mode:
            logger.info("Running in quick mode (fewer samples)")

        # Latency benchmark
        logger.info("\n=== Latency Benchmark ===")
        self.results['tests']['latency'] = self.benchmark_latency(
            num_requests=20 if quick_mode else 100,
            prompt_lengths=[128, 512] if quick_mode else [128, 512, 2048]
        )

        # Throughput benchmark
        logger.info("\n=== Throughput Benchmark ===")
        self.results['tests']['throughput'] = self.benchmark_throughput(
            duration_seconds=30 if quick_mode else 60,
            concurrent_requests=5 if quick_mode else 10
        )

        # Context length benchmark
        logger.info("\n=== Context Length Benchmark ===")
        self.results['tests']['context_length'] = self.benchmark_context_length(
            context_lengths=[1024, 4096] if quick_mode else [1024, 4096, 16384],
            num_samples=5 if quick_mode else 10
        )

        # Generation quality
        logger.info("\n=== Generation Quality Benchmark ===")
        self.results['tests']['generation_quality'] = self.benchmark_generation_quality(
            num_samples=2 if quick_mode else 5
        )

        return self.results

    def save_results(self, filename: Optional[str] = None) -> Path:
        """
        Save benchmark results to a JSON file inside ``self.output_dir``

        Args:
            filename: Output filename; defaults to ``benchmark_<timestamp>.json``

        Returns:
            Path of the written file
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"benchmark_{timestamp}.json"

        output_path = self.output_dir / filename

        # Explicit encoding so the output does not depend on the platform default.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2)

        logger.info(f"Results saved to {output_path}")
        return output_path

    def print_summary(self):
        """Log a summary of the latency and throughput results, if present"""
        logger.info("\n" + "="*60)
        logger.info("BENCHMARK SUMMARY")
        logger.info("="*60)

        tests = self.results['tests']

        if 'latency' in tests:
            logger.info("\nLatency Results:")
            for prompt_type, metrics in tests['latency'].items():
                logger.info(f"  {prompt_type}:")
                logger.info(f"    Mean: {metrics['mean_ms']:.2f}ms")
                logger.info(f"    P95: {metrics['p95_ms']:.2f}ms")
                logger.info(f"    P99: {metrics['p99_ms']:.2f}ms")

        if 'throughput' in tests:
            logger.info("\nThroughput Results:")
            metrics = tests['throughput']
            logger.info(f"  Requests/sec: {metrics['requests_per_second']:.2f}")
            logger.info(f"  Tokens/sec: {metrics['tokens_per_second']:.2f}")
            logger.info(f"  Avg Latency: {metrics['avg_latency_ms']:.2f}ms")

        logger.info("\n" + "="*60)
def main(argv: Optional[List[str]] = None):
    """
    Main entry point

    Parses command-line options, runs the selected benchmark(s), then saves
    and prints the results.

    Args:
        argv: Argument list to parse; None makes argparse read ``sys.argv``
    """
    parser = argparse.ArgumentParser(description="Helion Benchmark Runner")
    parser.add_argument("--base-url", type=str, default="http://localhost:8000")
    parser.add_argument("--output-dir", type=str, default="./benchmark_results")
    parser.add_argument("--quick", action="store_true", help="Run quick benchmark")
    parser.add_argument("--test", type=str, choices=['latency', 'throughput', 'context', 'quality', 'all'],
                        default='all', help="Specific test to run")

    args = parser.parse_args(argv)

    runner = BenchmarkRunner(
        base_url=args.base_url,
        output_dir=args.output_dir
    )

    # A single selected benchmark must be stored under runner.results['tests'],
    # using the same keys as run_all_benchmarks; otherwise save_results() writes
    # an empty test set and print_summary() has nothing to report.
    tests = runner.results['tests']

    if args.test == 'all':
        runner.run_all_benchmarks(quick_mode=args.quick)
    elif args.test == 'latency':
        tests['latency'] = runner.benchmark_latency(num_requests=20 if args.quick else 100)
    elif args.test == 'throughput':
        tests['throughput'] = runner.benchmark_throughput(duration_seconds=30 if args.quick else 60)
    elif args.test == 'context':
        tests['context_length'] = runner.benchmark_context_length(num_samples=5 if args.quick else 10)
    elif args.test == 'quality':
        tests['generation_quality'] = runner.benchmark_generation_quality(num_samples=2 if args.quick else 5)

    runner.save_results()
    runner.print_summary()


if __name__ == "__main__":
    main()