import os
import json

from transformers import PreTrainedTokenizer


class CharacterTokenizer(PreTrainedTokenizer):
    """
    Simple character-level tokenizer: every character of the input text is a
    token, special tokens occupy the lowest ids, and unknown characters map
    to the [UNK] token.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab=None,
        unk_token="[UNK]",
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        sep_token="[SEP]",
        **kwargs
    ):
        if vocab is None:
            vocab = {}

        special_tokens = [
            unk_token,
            pad_token,
            bos_token,
            eos_token,
            sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
        # The vocab must exist before super().__init__ runs, because the base
        # class may call get_vocab()/_tokenize() while registering special tokens.
        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs
        )
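
    # A round trip through the tokenizer, as a sketch (the toy vocab below is
    # made up for illustration; special tokens are appended after it):
    #
    #     tok = CharacterTokenizer(vocab={"h": 0, "i": 1})
    #     tok.encode("hi", add_special_tokens=False)  # -> [0, 1]
    #     tok.decode([0, 1])                          # -> "hi"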

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        # Character-level tokenization: one token per character.
        return list(text)

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.inv_vocab.get(index, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        os.makedirs(save_directory, exist_ok=True)

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json",
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)
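
    # Sketch of the save/load round trip ("./tok_dir" is a placeholder path):
    #
    #     path, = tok.save_vocabulary("./tok_dir")
    #     tok2 = CharacterTokenizer.from_json(path)
    #     assert tok2.get_vocab() == tok.get_vocab()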

    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        encoded_texts = [self.encode(text, add_special_tokens=False) for text in texts]

        if truncation and max_length is not None:
            # Reserve room for BOS/EOS so the final length never exceeds max_length.
            limit = max_length - 2 if add_special_tokens else max_length
            encoded_texts = [ids[:limit] for ids in encoded_texts]
        if add_special_tokens:
            bos_token_id = self.convert_tokens_to_ids(self.bos_token)
            eos_token_id = self.convert_tokens_to_ids(self.eos_token)
            encoded_texts = [[bos_token_id] + ids + [eos_token_id] for ids in encoded_texts]

        if padding:
            pad_id = self.vocab.get(self.pad_token, 0)
            # Pad to max_length if given, otherwise to the longest sequence in the batch.
            max_len = max(len(ids) for ids in encoded_texts) if max_length is None else max_length
            if self.padding_side == "right":
                encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
            else:
                encoded_texts = [[pad_id] * (max_len - len(ids)) + ids for ids in encoded_texts]
        return encoded_texts
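
    # Example (sketch): with max_length=5 and special tokens, each text keeps
    # at most 3 characters so that [BOS] + chars + [EOS] fits in 5 ids:
    #
    #     tok.batch_encode(["hello", "hi"], add_special_tokens=True,
    #                      padding=True, max_length=5)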

    def train(self, texts):
        # Rebuild the vocabulary from scratch, keeping special tokens at the lowest ids.
        vocab = {}
        special_tokens = [
            self.unk_token,
            self.pad_token,
            self.bos_token,
            self.eos_token,
            self.sep_token,
        ]
        for token in special_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

        for text in texts:
            # Blank out literal special-token strings so their characters are
            # not added to the vocabulary one by one.
            processed_text = text
            for token in special_tokens:
                processed_text = processed_text.replace(token, " ")

            for char in processed_text:
                if char not in vocab:
                    vocab[char] = len(vocab)

        self.vocab = vocab
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        print(f"Vocabulary built with {len(self.vocab)} tokens")
        return self
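
    # Training on a tiny made-up corpus, as a sketch:
    #
    #     tok = CharacterTokenizer().train(["abba", "cab"])
    #     # vocab = 5 special tokens + {'a', 'b', 'c'} -> 8 entries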

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_vocab(cls, vocab, **kwargs):
        return cls(vocab=vocab, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        from transformers import PreTrainedTokenizerFast
        from transformers.utils import cached_file

        # 1. Try to resolve vocab.json through the Hub cache machinery.
        try:
            vocab_file = cached_file(
                pretrained_model_name_or_path,
                "vocab.json",
                _raise_exceptions_for_missing_entries=False,
            )
        except Exception:
            vocab_file = None

        if vocab_file:
            return cls.from_json(vocab_file, **kwargs)

        # 2. Fall back to a vocab.json sitting in a local directory.
        local_vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        if os.path.exists(local_vocab_file):
            return cls.from_json(local_vocab_file, **kwargs)

        # 3. Fall back to a fast-tokenizer checkpoint and reuse its vocabulary.
        tokenizer_files = [
            os.path.join(pretrained_model_name_or_path, "tokenizer.json"),
            os.path.join(pretrained_model_name_or_path, "tokenizer_config.json"),
        ]
        for tokenizer_file in tokenizer_files:
            if os.path.exists(tokenizer_file):
                print(f"Loading using PreTrainedTokenizerFast from {tokenizer_file}")
                fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_model_name_or_path)
                return cls.from_vocab(dict(fast_tokenizer.get_vocab()), **kwargs)

        raise ValueError(
            f"Could not find vocab.json or a tokenizer file in {pretrained_model_name_or_path}."
        )
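

if __name__ == "__main__":
    # Quick smoke test, as a sketch: the corpus and texts below are made-up
    # sample data, not part of any real dataset.
    tokenizer = CharacterTokenizer().train(["hello world", "character tokenizers are simple"])
    batch = tokenizer.batch_encode(
        ["hello", "hi"], add_special_tokens=True, padding=True, max_length=8
    )
    print(batch)
    print(tokenizer.decode(batch[0], skip_special_tokens=True))  # -> "hello"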