import os

import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer


def process_data():
    # Paths and split configuration
    input_file_path = "data/raw/merged_text/corpus.txt"
    tokenizer_path = "Tokenizer/BPE"
    output_dir = "data/bin"
    val_split_ratio = 0.1

    os.makedirs(output_dir, exist_ok=True)

    # Load the trained BPE tokenizer
    print(f"Loading tokenizer from {tokenizer_path}...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    eos_id = tokenizer.eos_token_id
    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"EOS ID: {eos_id}")

    # Token IDs are stored as uint16 below, so every ID must fit in 16 bits.
    assert len(tokenizer) <= 2**16, "Vocabulary too large for uint16 storage"

    # Read the whole corpus into memory, one document per line.
    print(f"Reading {input_file_path}...")
    with open(input_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    print(f"Total lines: {len(lines):,}")

    # Tokenize line by line, appending EOS after each document so that
    # document boundaries survive in the flat token stream. (Depending on
    # its config, the tokenizer may also insert special tokens of its own.)
    print("Tokenizing...")
    all_tokens = []

    for line in tqdm(lines):
        text = line.strip()
        if not text:
            continue

        tokens = tokenizer.encode(text)
        tokens.append(eos_id)
        all_tokens.extend(tokens)

    token_count = len(all_tokens)
    print(f"Total tokens: {token_count:,}")

    # uint16 halves the on-disk size compared to int32.
    ids = np.array(all_tokens, dtype=np.uint16)

    # Hold out the tail of the token stream for validation.
    val_count = int(token_count * val_split_ratio)
    train_ids = ids[:-val_count]
    val_ids = ids[-val_count:]

    print(f"Train tokens: {len(train_ids):,}")
    print(f"Val tokens: {len(val_ids):,}")

    # Write raw binary token streams (no header), readable later via
    # np.fromfile or np.memmap with dtype=np.uint16.
    train_ids.tofile(os.path.join(output_dir, "train.bin"))
    val_ids.tofile(os.path.join(output_dir, "val.bin"))

    print(f"✅ Saved binary files to {output_dir}/")


if __name__ == "__main__":
    process_data()
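
# Illustrative only, not part of the pipeline: a minimal sketch of how the
# emitted .bin files could be consumed during training, assuming the same
# uint16 dtype used above. np.memmap avoids loading the whole token stream
# into RAM; block_size here is an arbitrary example value.
#
#   data = np.memmap("data/bin/train.bin", dtype=np.uint16, mode="r")
#   block_size = 1024
#   i = np.random.randint(0, len(data) - block_size)
#   x = data[i : i + block_size].astype(np.int64)           # model input
#   y = data[i + 1 : i + 1 + block_size].astype(np.int64)   # next-token targets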