| import os |
| import argparse |
| import torch |
| from torch.utils.data import Dataset, DataLoader |
| from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig |
| from tqdm import tqdm |
| import pandas as pd |
| import torch.nn.functional as F |
|
|
class CSVDataset(Dataset):
    """Packs consecutive rows of a CSV 'Text' column into fixed-length token sequences.

    Each sample concatenates up to ``rows_per_sample`` rows, appends an EOS
    token after every row, then pads or truncates the stream to exactly
    ``seq_length`` tokens.
    """

    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample,
                 cap_sample_len=17500):
        """
        Args:
            filepath: Path to a CSV file with a 'Text' column.
            tokenizer: Tokenizer with an HF-style interface (callable,
                ``eos_token``/``pad_token`` attributes, ``add_special_tokens``).
            seq_length: Fixed token length of every returned sample.
            rows_per_sample: Number of CSV rows concatenated per sample.
            cap_sample_len: Per-row character cap applied before tokenization;
                the cut prefers the last space before the cap. Defaults to the
                previously hard-coded limit (17500).
        """
        self.data = pd.read_csv(filepath)
        # Coerce every row to str and map NaN (empty CSV cells become
        # float('nan')) to "" so len()/slicing in __getitem__ cannot raise
        # TypeError on non-string rows.
        self.text_data = ["" if pd.isna(t) else str(t)
                          for t in self.data['Text'].tolist()]
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample

        # Character cap per row, applied before tokenization.
        self.CAP_SAMPLE_LEN = cap_sample_len

        # Ensure EOS/PAD tokens exist. NOTE(review): adding tokens grows the
        # tokenizer vocabulary, but the model's embeddings are not resized
        # anywhere in this file — confirm the resulting ids are within the
        # model's vocab before evaluating.
        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        # Ceiling division: the final sample may hold fewer rows.
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def __getitem__(self, idx):
        """Return a 1-D LongTensor of exactly ``self.seq_length`` token ids."""
        start_idx = idx * self.rows_per_sample
        end_idx = min(start_idx + self.rows_per_sample, len(self.text_data))

        lines = self.text_data[start_idx:end_idx]

        # Cap each row's character length, cutting at the last space before
        # the cap when possible so a word is not split mid-way.
        truncated_lines = []
        for text in lines:
            if len(text) > self.CAP_SAMPLE_LEN:
                cut = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
                if cut < 0:
                    cut = self.CAP_SAMPLE_LEN
                text = text[:cut]
            truncated_lines.append(text)

        # Tokenize all rows in one call; reserve 2 positions so a single row
        # plus its EOS can never exceed seq_length on its own.
        batch_encodings = self.tokenizer(
            truncated_lines,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,
            return_tensors=None
        )

        # Concatenate the rows, terminating each one with EOS.
        input_ids_list = []
        for tokens in batch_encodings["input_ids"]:
            tokens.append(self.eos_token_id)
            input_ids_list.extend(tokens)

        # Defensive: guarantee the stream ends with EOS.
        if input_ids_list[-1] != self.eos_token_id:
            input_ids_list.append(self.eos_token_id)

        if len(input_ids_list) > self.seq_length:
            # Truncate from the end, then force a terminal EOS.
            tokens_to_remove = len(input_ids_list) - self.seq_length
            input_ids_list = input_ids_list[:-tokens_to_remove]
            if input_ids_list[-1] != self.eos_token_id:
                input_ids_list[-1] = self.eos_token_id
        elif len(input_ids_list) < self.seq_length:
            # Right-pad with PAD, keeping the very last position as EOS.
            padding_length = self.seq_length - len(input_ids_list)
            input_ids_list.extend([self.pad_token_id] * padding_length)
            input_ids_list[-1] = self.eos_token_id

        return torch.tensor(input_ids_list, dtype=torch.long)
|
|
|
|
def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the loss for each batch.

    Padding positions are excluded from the loss: when the dataloader's
    dataset exposes ``pad_token_id`` (CSVDataset does), those label positions
    are set to -100, the ignore index of the causal-LM cross-entropy loss.
    Without this, trivially-predicted PAD tokens deflate/distort the reported
    loss on padded batches.

    Args:
        model: A causal LM that accepts ``model(input_ids, labels=...)`` and
            returns an object with a scalar ``.loss``.
        dataloader: Yields LongTensor batches of token ids.
        device: Device to move each batch to.

    Returns:
        float: Mean per-batch loss over the whole dataloader.
    """
    model.eval()
    total_loss = 0

    # Best-effort lookup; falls back to unmasked labels if the dataset does
    # not advertise its pad id.
    pad_token_id = getattr(getattr(dataloader, "dataset", None), "pad_token_id", None)

    with torch.no_grad():
        for batch_idx, input_ids in enumerate(tqdm(dataloader, desc="Evaluating Model")):
            input_ids = input_ids.to(device)

            labels = input_ids
            if pad_token_id is not None:
                labels = input_ids.clone()
                labels[labels == pad_token_id] = -100  # ignored by the loss

            # NOTE(review): no attention_mask is passed, so pad positions are
            # still attended to — confirm this is acceptable for this model.
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss.item()
            total_loss += loss

            print(f"Batch {batch_idx + 1} Loss: {loss:.4f}")

    avg_loss = total_loss / len(dataloader)
    return avg_loss
|
|
|
|
def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device):
    """
    Load a tokenizer and an fp16 causal-LM checkpoint, run one evaluation
    pass over the CSV dataset, and return the average loss.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # 50 CSV rows are packed into each fixed-length sample; evaluation order
    # is deterministic (no shuffling).
    dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=50)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=4,
    )

    # Half-precision load keeps memory usage down for large checkpoints.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
    )
    model = model.to(device)
    print(model.dtype)

    print("Evaluating Model...")
    avg_loss = evaluate_model(model, dataloader, device)
    print(f"Average Loss: {avg_loss:.4f}")

    return avg_loss
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: evaluate one checkpoint on one CSV file.
    parser = argparse.ArgumentParser()

    # (flag, kwargs) table keeps the CLI definition compact and scannable.
    cli_args = [
        ("--model_path", dict(type=str, required=True, help="Path to the model.")),
        ("--tokenizer_path", dict(type=str, required=True, help="Path to the tokenizer.")),
        ("--csv_path", dict(type=str, required=True, help="Path to the CSV file with 'Text' column.")),
        ("--seq_length", dict(type=int, default=4096, help="Maximum sequence length.")),
        ("--batch_size", dict(type=int, default=2, help="Batch size for evaluation.")),
        ("--device", dict(type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use.")),
    ]
    for flag, kwargs in cli_args:
        parser.add_argument(flag, **kwargs)

    cli = parser.parse_args()

    evaluate_single_model(
        cli.model_path,
        cli.tokenizer_path,
        cli.csv_path,
        cli.seq_length,
        cli.batch_size,
        cli.device,
    )
|
|