Instructions for using anthonym21/json-tokenizer-structured with libraries, inference providers, notebooks, and local apps. Follow the sections below to get started.
- Libraries
- Transformers
How to use anthonym21/json-tokenizer-structured with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="anthonym21/json-tokenizer-structured")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("anthonym21/json-tokenizer-structured", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use anthonym21/json-tokenizer-structured with vLLM:
Install from pip and serve the model:
```shell
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "anthonym21/json-tokenizer-structured"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "anthonym21/json-tokenizer-structured",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker:
```shell
docker model run hf.co/anthonym21/json-tokenizer-structured
```
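Once the server is running, any OpenAI-compatible client can call it. A minimal sketch using the `openai` Python package (the base URL and port assume the default `vllm serve` settings above; the client library choice is an assumption, plain `requests` works equally well):
```python
# Minimal sketch: call the vLLM server's OpenAI-compatible completions API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="anthonym21/json-tokenizer-structured",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```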
- SGLang
How to use anthonym21/json-tokenizer-structured with SGLang:
Install from pip and serve the model:
```shell
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "anthonym21/json-tokenizer-structured" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "anthonym21/json-tokenizer-structured",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images:
```shell
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
  --model-path "anthonym21/json-tokenizer-structured" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "anthonym21/json-tokenizer-structured",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use anthonym21/json-tokenizer-structured with Docker Model Runner:
```shell
docker model run hf.co/anthonym21/json-tokenizer-structured
```
| """ | |
| Byte-Pair Encoding trainer and codec optimized for JSON value strings. | |
| Uses incremental pair counting with pair→word index for fast merges. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| from collections import defaultdict | |
| from typing import Optional | |


def _bytes_to_unicode() -> dict[int, str]:
    """Map bytes 0-255 to unicode chars, avoiding control/whitespace collisions."""
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return {b: chr(c) for b, c in zip(bs, cs)}


BYTE_ENCODER = _bytes_to_unicode()
BYTE_DECODER = {v: k for k, v in BYTE_ENCODER.items()}
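
# Illustrative note (not in the original source): printable bytes map to
# themselves (BYTE_ENCODER[65] == "A"), while e.g. the space byte maps to
# "Ġ" (chr(288)); BYTE_DECODER inverts the mapping exactly.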

_PRE_TOK_PAT = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?[a-zA-Z_]+| ?[0-9]+| ?[^\s\w]+|\s+|."""
)
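
# Illustrative example (not in the original source):
# _PRE_TOK_PAT.findall('{"name": "John"}')
#   -> ['{"', 'name', '":', ' "', 'John', '"}']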


class BPETrainer:
    """Train a BPE vocabulary from a corpus of JSON value strings."""

    def __init__(self, vocab_size: int = 4096, min_frequency: int = 2):
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        self.merges: list[tuple[str, str]] = []
        self.vocab: dict[str, int] = {}
        self._id_to_tok: dict[int, str] | None = None

    def _pre_tokenize(self, text: str) -> list[str]:
        return _PRE_TOK_PAT.findall(text)

    def _text_to_bytes(self, text: str) -> tuple[str, ...]:
        return tuple(BYTE_ENCODER[b] for b in text.encode("utf-8"))
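
    # Illustrative example (not in the original source): non-ASCII text expands
    # to one symbol per UTF-8 byte, e.g. _text_to_bytes("é") == ("Ã", "©").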

    def train(self, texts: list[str]) -> None:
        """Train BPE with pair→word index for O(affected) merges."""
        # Count word frequencies
        word_freqs: dict[tuple[str, ...], int] = {}
        for text in texts:
            for word in self._pre_tokenize(text):
                bw = self._text_to_bytes(word)
                word_freqs[bw] = word_freqs.get(bw, 0) + 1
        # Base vocab
        base_vocab: set[str] = set()
        for word in word_freqs:
            base_vocab.update(word)
        # Reserve one slot for <UNK>
        num_merges = self.vocab_size - len(base_vocab) - 1
        # Word storage: idx → [symbols], freq
        words: list[list[str]] = []
        freqs: list[int] = []
        for w, f in word_freqs.items():
            words.append(list(w))
            freqs.append(f)
        # Pair counts and pair→word indices
        pair_counts: dict[tuple[str, str], int] = defaultdict(int)
        pair_to_words: dict[tuple[str, str], set[int]] = defaultdict(set)
        for idx, (w, f) in enumerate(zip(words, freqs)):
            for i in range(len(w) - 1):
                p = (w[i], w[i + 1])
                pair_counts[p] += f
                pair_to_words[p].add(idx)
        for step in range(max(0, num_merges)):
            if not pair_counts:
                break
            # Find best pair
            best_pair = max(pair_counts, key=pair_counts.__getitem__)
            if pair_counts[best_pair] < self.min_frequency:
                break
            a, b = best_pair
            merged = a + b
            self.merges.append(best_pair)
            # Only process words that contain this pair
            affected = list(pair_to_words.pop(best_pair, set()))
            del pair_counts[best_pair]
            for idx in affected:
                w = words[idx]
                f = freqs[idx]
                # Rewrite the word, merging every occurrence of the pair
                new_w: list[str] = []
                i = 0
                while i < len(w):
                    if i < len(w) - 1 and w[i] == a and w[i + 1] == b:
                        # Decrement old adjacent pairs
                        if new_w:
                            old_left = (new_w[-1], a)
                            pair_counts[old_left] -= f
                            if pair_counts[old_left] <= 0:
                                pair_counts.pop(old_left, None)
                                pair_to_words[old_left].discard(idx)
                        if i + 2 < len(w):
                            old_right = (b, w[i + 2])
                            pair_counts[old_right] -= f
                            if pair_counts[old_right] <= 0:
                                pair_counts.pop(old_right, None)
                                pair_to_words[old_right].discard(idx)
                        new_w.append(merged)
                        # Increment new adjacent pairs
                        if len(new_w) >= 2:
                            nl = (new_w[-2], merged)
                            pair_counts[nl] += f
                            pair_to_words[nl].add(idx)
                        if i + 2 < len(w):
                            nr = (merged, w[i + 2])
                            pair_counts[nr] += f
                            pair_to_words[nr].add(idx)
                        i += 2
                    else:
                        new_w.append(w[i])
                        i += 1
                words[idx] = new_w
            # Prune dead entries periodically
            if step % 50 == 0:
                pair_counts = defaultdict(int, {k: v for k, v in pair_counts.items() if v > 0})
        # Build vocab: base symbols first, then merge products, then <UNK>
        self.vocab = {}
        idx = 0
        for ch in sorted(base_vocab):
            self.vocab[ch] = idx
            idx += 1
        for merge in self.merges:
            m = merge[0] + merge[1]
            if m not in self.vocab:
                self.vocab[m] = idx
                idx += 1
        self.vocab["<UNK>"] = idx
        self._id_to_tok = None
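
    # Illustrative walk-through (not in the original source): for the corpus
    # ["ab ab", "ab"], the pair ("a", "b") starts with count 3; after it is
    # merged, only the words indexed under that pair are rewritten and only
    # counts adjacent to merge sites change, so no corpus rescan is needed.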

    def _apply_merge(self, word: tuple[str, ...], pair: tuple[str, str]) -> tuple[str, ...]:
        new: list[str] = []
        i = 0
        while i < len(word):
            if i < len(word) - 1 and word[i] == pair[0] and word[i + 1] == pair[1]:
                new.append(pair[0] + pair[1])
                i += 2
            else:
                new.append(word[i])
                i += 1
        return tuple(new)
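
    # Illustrative example (not in the original source):
    # _apply_merge(("l", "o", "w"), ("l", "o")) -> ("lo", "w")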

    def encode_word(self, word: str) -> list[str]:
        bw = self._text_to_bytes(word)
        if len(bw) == 1:
            return [bw[0]]
        for merge in self.merges:
            bw = self._apply_merge(bw, merge)
        return list(bw)
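
    # Note (added): merges are replayed in learned order over the whole word,
    # i.e. O(len(self.merges) * len(word)) per word; acceptable for short
    # JSON value strings, though long inputs would favor a pair-ranked merge.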

    def encode(self, text: str) -> list[str]:
        tokens: list[str] = []
        for word in self._pre_tokenize(text):
            tokens.extend(self.encode_word(word))
        return tokens

    def encode_to_ids(self, text: str) -> list[int]:
        tokens = self.encode(text)
        unk_id = self.vocab.get("<UNK>", 0)
        return [self.vocab.get(t, unk_id) for t in tokens]

    def id_to_token(self, token_id: int) -> str:
        if self._id_to_tok is None:
            self._id_to_tok = {v: k for k, v in self.vocab.items()}
        return self._id_to_tok.get(token_id, "<UNK>")

    def decode_ids(self, ids: list[int]) -> str:
        return self.decode_tokens([self.id_to_token(i) for i in ids])

    def decode_tokens(self, tokens: list[str]) -> str:
        byte_str = "".join(tokens)
        return bytearray(BYTE_DECODER.get(c, ord(c)) for c in byte_str).decode("utf-8", errors="replace")
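
    # Illustrative property (not in the original source): for text whose byte
    # symbols all appeared in training, decode_ids(encode_to_ids(text)) == text;
    # unseen symbols degrade to the literal string "<UNK>" on decode.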

    def save(self, path: str) -> None:
        with open(path, "w") as f:
            json.dump({
                "version": "json-tokenizer-bpe-v1",
                "vocab_size": self.vocab_size,
                "min_frequency": self.min_frequency,
                "merges": [list(m) for m in self.merges],
                "vocab": self.vocab,
            }, f, indent=2)

    @classmethod
    def load(cls, path: str) -> "BPETrainer":
        with open(path) as f:
            data = json.load(f)
        t = cls(vocab_size=data["vocab_size"], min_frequency=data["min_frequency"])
        t.merges = [tuple(m) for m in data["merges"]]
        t.vocab = data["vocab"]
        t._id_to_tok = None
        return t
```
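
A minimal usage sketch of the trainer above. The corpus, file name, and module name are illustrative assumptions, not part of the original source:
```python
# Assumes the module above is saved as bpe_trainer.py (hypothetical name).
from bpe_trainer import BPETrainer

corpus = ['{"name": "John"}', '{"name": "Jane"}', '{"age": 42}']

trainer = BPETrainer(vocab_size=512, min_frequency=2)
trainer.train(corpus)

# Byte-level BPE round-trips any text whose symbols were seen in training.
ids = trainer.encode_to_ids('{"name": "Jo"}')
print(trainer.decode_ids(ids))  # -> {"name": "Jo"}

# Persist and reload; load() is a classmethod returning a fresh instance.
trainer.save("bpe.json")
restored = BPETrainer.load("bpe.json")
assert restored.decode_ids(restored.encode_to_ids("name")) == "name"
```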