Text Generation
Transformers
PyTorch
Safetensors
English
rubirlm
causal-lm
base-model
1b
Mixture of Experts
Instructions to use DevHunterAI/RubiRLM-1B-Base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DevHunterAI/RubiRLM-1B-Base with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DevHunterAI/RubiRLM-1B-Base")
```

```python
# Load model directly
from transformers import RubiRLM

model = RubiRLM.from_pretrained("DevHunterAI/RubiRLM-1B-Base", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DevHunterAI/RubiRLM-1B-Base with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DevHunterAI/RubiRLM-1B-Base"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DevHunterAI/RubiRLM-1B-Base",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/DevHunterAI/RubiRLM-1B-Base
- SGLang
How to use DevHunterAI/RubiRLM-1B-Base with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "DevHunterAI/RubiRLM-1B-Base" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DevHunterAI/RubiRLM-1B-Base",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "DevHunterAI/RubiRLM-1B-Base" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DevHunterAI/RubiRLM-1B-Base",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use DevHunterAI/RubiRLM-1B-Base with Docker Model Runner:
docker model run hf.co/DevHunterAI/RubiRLM-1B-Base
| from __future__ import annotations | |
| import importlib.util | |
| from typing import Optional, Tuple | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from xqs_stack import choose_attention_backend, choose_quant_backend | |
| from xqs_triton_ops import triton_ternary_linear | |
# Optional flash-attn support. find_spec only confirms that the package is
# discoverable; the import itself can still fail (for example when the
# installed build does not match the running torch), so a failed import
# downgrades to "not available" instead of aborting the import of this module.
_HAS_FLASH_ATTN = importlib.util.find_spec("flash_attn") is not None
if _HAS_FLASH_ATTN:
    try:
        from flash_attn import flash_attn_func
    except ImportError:
        _HAS_FLASH_ATTN = False

# Backends are selected once, at import time, by the xqs_stack helpers.
_ATTN_BACKEND = choose_attention_backend(prefer_flash=True)
_QUANT_BACKEND = choose_quant_backend(prefer_triton=True)
def ternary_quantize(weight: torch.Tensor) -> torch.Tensor:
    """Quantize ``weight`` to the three levels ``{-scale, 0, +scale}``.

    ``scale`` is the mean absolute value of the detached weights, floored at
    ``1e-6``. Entries above ``0.5 * scale`` become ``+scale``, entries below
    ``-0.5 * scale`` become ``-scale`` and all remaining entries become zero.

    The result carries the quantized values in the forward pass, while the
    quantization offset is detached so that gradients reach ``weight``
    unchanged (straight-through estimator).
    """
    scale = weight.detach().abs().mean().clamp(min=1e-6)
    threshold = 0.5 * scale
    plus_one = torch.ones_like(weight)

    # Start from all zeros and overwrite the entries that clear the threshold.
    levels = torch.zeros_like(weight)
    levels = torch.where(weight > threshold, plus_one, levels)
    levels = torch.where(weight < -threshold, -plus_one, levels)

    ternary = levels * scale
    # Forward value is `ternary`; backward sees the identity w.r.t. `weight`.
    return weight + (ternary - weight).detach()
class TernaryLinear(nn.Module):
    """Linear layer whose weight is ternarized each time ``forward`` runs.

    A full-precision ``weight`` of shape ``(out_features, in_features)`` and an
    optional ``bias`` of shape ``(out_features,)`` are stored as parameters.
    The quantization backend is read from the module-level ``_QUANT_BACKEND``
    when the layer is constructed.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.backend = _QUANT_BACKEND
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        if not bias:
            # Register the name so that `self.bias` resolves to None.
            self.register_parameter("bias", None)
        else:
            self.bias = nn.Parameter(torch.empty(out_features))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-initialize ``weight`` (Kaiming uniform) and ``bias`` (uniform)."""
        nn.init.kaiming_uniform_(self.weight, a=5 ** 0.5)
        if self.bias is None:
            return
        # Bias is drawn from U(-1/sqrt(fan_in), 1/sqrt(fan_in)); fan_in is
        # floored at 1 to avoid dividing by zero.
        fan_in = max(1, self.in_features)
        limit = 1 / fan_in ** 0.5
        nn.init.uniform_(self.bias, -limit, limit)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the layer to ``x`` with the backend chosen at construction."""
        if self.backend != "triton":
            return F.linear(x, ternary_quantize(self.weight), self.bias)
        # NOTE(review): this path is taken regardless of the device of `x`;
        # confirm that triton_ternary_linear copes with non-CUDA tensors.
        return triton_ternary_linear(x, self.weight, self.bias)
def build_linear(in_features: int, out_features: int, bias: bool = True, ternary: bool = False) -> nn.Module:
    """Create a linear layer, ternary-quantized when ``ternary`` is true.

    Returns a ``TernaryLinear`` if ``ternary`` is set and a plain ``nn.Linear``
    otherwise; both receive the same feature sizes and ``bias`` flag.
    """
    layer_cls = TernaryLinear if ternary else nn.Linear
    return layer_cls(in_features, out_features, bias=bias)
def fused_residual_add(x: torch.Tensor, residual: torch.Tensor, gate: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Return ``x + residual``, scaling ``residual`` by ``gate`` when given."""
    delta = residual if gate is None else gate * residual
    return x + delta
def causal_scaled_dot_product_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    dropout_p: float = 0.0,
    training: bool = False,
) -> torch.Tensor:
    """Causal attention of ``q`` over ``k``/``v`` with backend fallbacks.

    Backends are tried in this order:

    1. flash-attn, only when it was selected at import time, is importable,
       and ``q`` is a CUDA tensor in fp16 or bf16;
    2. ``F.scaled_dot_product_attention`` when this torch build provides it;
    3. an explicit matmul / mask / softmax implementation.

    Dropout with probability ``dropout_p`` is applied to the attention
    weights only when ``training`` is true.
    """
    effective_dropout = dropout_p if training else 0.0

    flash_ok = (
        _ATTN_BACKEND == "flash_attn"
        and _HAS_FLASH_ATTN
        and q.is_cuda
        and q.dtype in (torch.float16, torch.bfloat16)
    )
    if flash_ok:
        # assumes q/k/v arrive as (batch, heads, seq, head_dim): the kernel is
        # fed the tensors with dims 1 and 2 swapped and the result is swapped
        # back — TODO confirm layout with callers.
        # NOTE(review): causal-mask alignment may differ between this path and
        # the fallbacks below when q and k have different sequence lengths —
        # confirm callers always pass equal lengths.
        q_t, k_t, v_t = (t.transpose(1, 2).contiguous() for t in (q, k, v))
        attended = flash_attn_func(
            q_t,
            k_t,
            v_t,
            dropout_p=effective_dropout,
            causal=True,
        )
        return attended.transpose(1, 2).contiguous()

    if hasattr(F, "scaled_dot_product_attention"):
        return F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            dropout_p=effective_dropout,
            is_causal=True,
        )

    # Manual fallback: scaled scores, strictly-upper-triangular mask, softmax.
    logits = torch.matmul(q, k.transpose(-2, -1)) * (q.size(-1) ** -0.5)
    future = torch.ones(
        logits.size(-2),
        logits.size(-1),
        device=logits.device,
        dtype=torch.bool,
    ).triu(diagonal=1)
    weights = torch.softmax(logits.masked_fill(future, float("-inf")), dim=-1)
    if effective_dropout > 0:
        weights = F.dropout(weights, p=effective_dropout)
    return torch.matmul(weights, v)
def pack_rows(indices: torch.Tensor, *tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]:
    """Select the rows listed in ``indices`` (dim 0) from every given tensor.

    Returns a tuple with one gathered tensor per input, in the same order;
    the tuple is empty when no tensors are passed.
    """
    gathered = [tensor.index_select(0, indices) for tensor in tensors]
    return tuple(gathered)
def scatter_rows(base: torch.Tensor, indices: torch.Tensor, updates: torch.Tensor) -> torch.Tensor:
    """Return ``base`` with the rows at ``indices`` (dim 0) replaced by ``updates``.

    ``base`` is never modified: the rows are written into a clone. When
    ``indices`` is empty no copy is made and ``base`` itself is returned.
    """
    if indices.numel() > 0:
        # index_copy_ works in place on the clone and returns that clone.
        return base.clone().index_copy_(0, indices, updates)
    return base
def maybe_compile_module(module: nn.Module, enabled: bool) -> nn.Module:
    """Return ``torch.compile(module)`` when possible, else ``module`` itself.

    Compilation is best-effort. The original module is returned unchanged when
    ``enabled`` is false, when this torch build has no ``torch.compile``, or
    when the ``torch.compile`` call raises. A failure is logged (with its
    traceback) rather than silently discarded, so an uncompiled fallback is
    visible to whoever runs the model.

    Args:
        module: The module to wrap.
        enabled: Whether compilation should be attempted at all.

    Returns:
        The compiled wrapper on success, otherwise ``module``.
    """
    if not enabled:
        return module
    compile_fn = getattr(torch, "compile", None)
    if compile_fn is None:
        return module
    try:
        return compile_fn(module)
    except Exception:
        # Deliberately broad: whatever goes wrong here, the caller still gets
        # a usable (uncompiled) module. Logging cannot raise into the caller.
        import logging

        logging.getLogger(__name__).warning(
            "torch.compile failed for %s; using the uncompiled module",
            type(module).__name__,
            exc_info=True,
        )
        return module