Spaces:
Build error
Build error
| import numpy as np | |
| import pandas as pd | |
| import pytest | |
| from transformers import AutoModel, AutoTokenizer | |
| from src.nlp_models import HuggingFaceEmbeddings | |
| # import torch | |
| # import os | |
| #################################################################################################### | |
| ################################## Test the Text Embeddings Model ################################## | |
| #################################################################################################### | |
@pytest.fixture
def mock_text_data(tmp_path):
    """
    Fixture to create a mock CSV file with text data for testing.

    Parameters:
    ----------
    tmp_path : pathlib.Path
        Directory the CSV file is written into; when requested as a fixture
        argument, pytest's built-in ``tmp_path`` supplies a temporary one.

    Returns:
    -------
    str
        Path to the written CSV file. The file has a single ``description``
        column holding two product descriptions.
    """
    data = {"description": ["Product 1 description", "Product 2 description"]}
    df = pd.DataFrame(data)
    file_path = tmp_path / "test_text_data.csv"
    # index=False keeps the DataFrame index out of the file, so the CSV
    # contains only the "description" column.
    df.to_csv(file_path, index=False)
    # Hand back a plain string rather than a Path object.
    return str(file_path)
@pytest.mark.parametrize(
    "model_name, expected_hidden_size",
    [
        # NOTE(review): minimal case so the test is collectable; extend this
        # list with the other models the project is expected to support.
        ("sentence-transformers/all-MiniLM-L6-v2", 384),
    ],
)
def test_huggingface_embeddings_generic(
    model_name, expected_hidden_size, mock_text_data
):
    """
    Generic test for loading a Hugging Face model and generating a text embedding.

    This test ensures that:
    - The tokenizer and model held by ``HuggingFaceEmbeddings`` have the same
      types as the ones ``AutoTokenizer`` / ``AutoModel`` load for the model name.
    - ``get_embedding`` returns a 1-D NumPy array of the expected size for a
      sample sentence.

    Saving embeddings to a CSV file is not exercised by this test.

    Parameters:
    ----------
    model_name : str
        The name of the Hugging Face model to test.
    expected_hidden_size : int
        The expected hidden size (dimensionality) of the embeddings generated by the model.
    mock_text_data : str
        Path to the mock CSV file containing text descriptions.
    """
    # Initialize the HuggingFaceEmbeddings model with the provided model name
    model = HuggingFaceEmbeddings(
        model_name=model_name, path=mock_text_data, device="cpu"
    )

    # Check that the tokenizer was loaded correctly. The reference type is
    # resolved once so a failing assertion does not trigger a second load
    # just to build its message.
    expected_tokenizer_type = type(AutoTokenizer.from_pretrained(model_name))
    assert isinstance(model.tokenizer, expected_tokenizer_type), (
        f"Tokenizer should be an instance of {expected_tokenizer_type}"
    )

    # Same check for the model itself.
    expected_model_type = type(AutoModel.from_pretrained(model_name))
    assert isinstance(model.model, expected_model_type), (
        f"Model should be an instance of {expected_model_type}"
    )

    # Generate embeddings for a sample text
    sample_text = "This is a test description."
    embeddings = model.get_embedding(sample_text)

    # Check that the embeddings are a NumPy array with the expected shape
    assert isinstance(embeddings, np.ndarray), "Embeddings should be a NumPy array"
    assert embeddings.shape == (expected_hidden_size,), (
        f"Embeddings shape should be ({expected_hidden_size},), got {embeddings.shape}"
    )
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; re-raise it as the process
    # exit status so running this file as a script fails when tests fail.
    raise SystemExit(pytest.main())