from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from pathlib import Path
import json

# Input corpus and output path for the trained tokenizer.
corpus_path = Path("data/corpus.txt")
tokenizer_path = Path("data/tokenizer.json")

# Read the corpus, dropping blank lines.
with corpus_path.open("r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]
# BPE model with the unknown token wired in, so out-of-vocabulary symbols
# map to <UNK> at encode time instead of being dropped silently.
tokenizer = Tokenizer(models.BPE(unk_token="<UNK>"))
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Punctuation(),
])
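# Quick sanity check (the sample string is my own, not from the corpus): the
# combined pre-tokenizer splits on whitespace and isolates punctuation, so
#   tokenizer.pre_tokenizer.pre_tokenize_str("Hello, world!")
# should yield something like
#   [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]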
# Train a 5k-entry BPE vocabulary; vocab_size also counts the special tokens
# and the base alphabet.
trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=["<PAD>", "<UNK>", "<EOS>"])
tokenizer.train_from_iterator(lines, trainer)

# Persist the full tokenizer (model, pre-tokenizer, special tokens) as JSON.
tokenizer.save(str(tokenizer_path))
# Export plain lookup tables: stoi maps token -> id, itos inverts it.
vocab = tokenizer.get_vocab()
stoi = vocab
itos = {v: k for k, v in vocab.items()}

# Note that JSON object keys must be strings, so itos's integer keys are
# stringified on disk.
with open("data/vocab.json", "w", encoding="utf-8") as f:
    json.dump({"stoi": stoi, "itos": itos}, f)
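# Anyone reloading vocab.json needs to restore itos's integer keys; a minimal
# sketch, assuming the layout written above:
#   with open("data/vocab.json", encoding="utf-8") as f:
#       data = json.load(f)
#   itos = {int(k): v for k, v in data["itos"].items()}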
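# Minimal round-trip sketch (the sample sentence is an assumption, not taken
# from the corpus): reload the saved tokenizer and encode/decode with it.
reloaded = Tokenizer.from_file(str(tokenizer_path))
enc = reloaded.encode("Hello, world!")
print(enc.tokens)                # learned subword strings
print(enc.ids)                   # matching integer ids
print(reloaded.decode(enc.ids))  # text rebuilt from ids (spacing may differ without a decoder)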