from typing import Any, Dict, Iterable

from pydantic import BaseModel
from sklearn.feature_extraction.text import TfidfVectorizer
import wordcloud
| |
|
| |
|
class WordCloudExtractor:
    """Build a word cloud from a text corpus weighted by TF-IDF scores.

    Attributes:
        tfidf_params: Keyword arguments forwarded to ``TfidfVectorizer``
            (e.g. ``stop_words``, ``lowercase``, ``ngram_range``).
    """

    tfidf_params: Dict[str, Any]

    def __init__(self, tfidf_params=None):
        """Store vectorizer overrides; an empty dict means library defaults.

        Args:
            tfidf_params: Optional dict of TfidfVectorizer keyword arguments.
        """
        # Copy defensively so a caller mutating their dict afterwards
        # does not silently change this extractor's configuration.
        self.tfidf_params = {} if tfidf_params is None else dict(tfidf_params)

    def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
        """Fit TF-IDF on *texts* and render a cloud of the top *n_words* terms.

        Args:
            texts: Documents to vectorize.
            n_words: Maximum number of terms retained in the cloud.

        Returns:
            A ``wordcloud.WordCloud`` generated from each term's mean
            TF-IDF weight across the corpus.
        """
        # max_features caps the vocabulary, so the cloud never exceeds
        # the requested number of words; caller overrides are applied on top.
        vectorizer = TfidfVectorizer(max_features=n_words, **self.tfidf_params)
        # Materialize the iterable: fit_transform consumes it twice internally.
        matrix = vectorizer.fit_transform(list(texts))
        # Mean TF-IDF over all documents gives one scalar weight per term.
        weights = np.asarray(matrix.mean(axis=0)).ravel()
        frequencies = dict(zip(vectorizer.get_feature_names_out(), weights))
        return wordcloud.WordCloud().generate_from_frequencies(frequencies)
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from wordcloud import WordCloud |
| | import numpy as np |
| |
|
class TextVisualization:
    @staticmethod
    def extract_from_corpus(texts, max_features=100):
        """Compute mean TF-IDF weights per word for a corpus.

        Fits a ``TfidfVectorizer`` over *texts* and averages every term's
        TF-IDF score across all documents, yielding a frequency mapping
        suitable for ``WordCloud.generate_from_frequencies``.

        Args:
            texts: List of text documents.
            max_features: Maximum number of words to include.

        Returns:
            Dict mapping each retained word to its mean TF-IDF weight.
        """
        vectorizer = TfidfVectorizer(
            lowercase=True,
            max_features=max_features,
            stop_words='english',
        )
        weighted = vectorizer.fit_transform(texts)

        # Average each column (term) over all rows (documents); the sparse
        # matrix mean comes back as a 1xN matrix, so flatten it to a vector.
        per_term_mean = np.asarray(weighted.mean(axis=0)).ravel()

        return {
            word: score
            for word, score in zip(vectorizer.get_feature_names_out(), per_term_mean)
        }
| |
|