---
datasets:
- ddrg/named_math_formulas
- ddrg/math_formula_retrieval
- ddrg/math_formulas
- ddrg/math_text
---

Pretrained model based on [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) with further mathematical pre-training.
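
For standard use, the checkpoint loads like any other DeBERTa-v3 model with the generic `transformers` classes. A minimal sketch (the repository id below is a placeholder for this model's id or a local path):

```python
from transformers import AutoModel, AutoTokenizer

model_name = "<this-model-repo>"  # placeholder: replace with the actual repository id or a local path

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)  # loads the DeBERTa encoder; the extra pooler weights are ignored

inputs = tokenizer(r"a^2 + b^2 = c^2", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, hidden_size)
```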
Compared to deberta-v3-base, 300 additional mathematical LaTeX tokens were added to the vocabulary before the mathematical pre-training. Because this additional pre-training used NSP-like tasks, a pooling layer was added to the model (stored in the checkpoint under the keys `bias` and `weight`). If you do not need this pooling layer, simply use the standard transformers DeBERTa model. If you want to use the additional pooling layer in the same way as BERT's pooler, a wrapper class like the following can be used:
```python
from typing import Mapping, Any

import torch
from torch import nn
from transformers import DebertaV2Model, DebertaV2Tokenizer, AutoConfig, AutoTokenizer


class DebertaV2ModelWithPoolingLayer(nn.Module):

    def __init__(self, pretrained_model_name):
        super().__init__()

        # Load the DeBERTa model and tokenizer
        self.deberta = DebertaV2Model.from_pretrained(pretrained_model_name)
        self.tokenizer = DebertaV2Tokenizer.from_pretrained(pretrained_model_name)

        # Add a pooling layer (Linear + tanh activation) for the CLS token
        self.pooling_layer = nn.Sequential(
            nn.Linear(self.deberta.config.hidden_size, self.deberta.config.hidden_size),
            nn.Tanh()
        )

        self.config = self.deberta.config
        self.embeddings = self.deberta.embeddings

    def forward(self, input_ids, attention_mask=None, *args, **kwargs):
        # Forward pass through the DeBERTa model
        outputs = self.deberta(input_ids, attention_mask=attention_mask, *args, **kwargs)

        # Extract the hidden states from the output
        hidden_states = outputs.last_hidden_state

        # Get the CLS token representation (first token)
        cls_token = hidden_states[:, 0, :]

        # Apply the pooling layer to the CLS token representation
        pooled_output = self.pooling_layer(cls_token)

        # Include the pooled output in the output dictionary as 'pooler_output'
        outputs["pooler_output"] = pooled_output

        return outputs

    def save_pretrained(self, path):
        # Save the model's state_dict (encoder + pooler), configuration, and tokenizer
        state_dict = self.deberta.state_dict()
        state_dict.update(self.pooling_layer[0].state_dict())

        torch.save(state_dict, f"{path}/pytorch_model.bin")
        self.deberta.config.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        # The pooler parameters are stored under the bare keys 'bias' and 'weight';
        # all remaining keys belong to the DeBERTa encoder.
        pooler_keys = ['bias', 'weight']
        deberta_state_dict = {k: v for k, v in state_dict.items() if k not in pooler_keys}
        pooler_state_dict = {k: v for k, v in state_dict.items() if k in pooler_keys}
        self.deberta.load_state_dict(deberta_state_dict, strict=strict)
        self.pooling_layer[0].load_state_dict(pooler_state_dict)

    @classmethod
    def from_pretrained(cls, name):
        # Initialize the instance (loads the base encoder and tokenizer)
        instance = cls(name)

        try:
            # Load the full state_dict, including the pooling layer, from a local directory
            instance.load_state_dict(torch.load(f"{name}/pytorch_model.bin"))
        except FileNotFoundError:
            print("Could not find DeBERTa pooling layer. Initializing new values.")

        # Load the configuration and tokenizer
        instance.deberta.config = AutoConfig.from_pretrained(name)
        instance.tokenizer = AutoTokenizer.from_pretrained(name)

        return instance
```
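
A short usage sketch of the wrapper. The path below is a placeholder: the trained pooler weights in `pytorch_model.bin` are only picked up when loading from a local copy of this repository (e.g. obtained via `git clone` or `huggingface_hub.snapshot_download`); otherwise the pooling layer is freshly initialized.

```python
# Illustrative only; "path/to/local/model" is a placeholder for a local copy
# of this repository containing pytorch_model.bin with the pooler weights.
model = DebertaV2ModelWithPoolingLayer.from_pretrained("path/to/local/model")
model.eval()

inputs = model.tokenizer(r"E = m c^2", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (1, sequence_length, hidden_size)
print(outputs["pooler_output"].shape)   # (1, hidden_size)
```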