Upload 7 files

Browse files

Files changed (7) hide show

README.md +78 -3
config.py +34 -0
get_embeddings.py +44 -0
model.py +36 -0
requirements.txt +5 -0
test.py +65 -0
train.py +83 -0

README.md CHANGED Viewed

@@ -1,3 +1,78 @@
----
-license: mit
----

+# CULTURE: Curve Understanding and Learning Transformer for Unique Replication Estimation
+CULTURE (Curve Understanding and Learning Transformer for Unique Replication Estimation) is a custom BERT-based model with a Masked Language Model (MLM) head for processing microbial growth curve data. The model is trained on 1x128 vector inputs representing microbial growth curves and can generate hidden state embeddings for downstream tasks in microbiology and bioinformatics.
+## Background
+Microbial growth curves are time-series data that represent the growth of microorganisms over time. Each 1x128 vector in our dataset represents a single growth curve, with 128 time points measuring microbial population or density. By applying CULTURE to this data, we aim to capture complex patterns and relationships within these growth curves, potentially enabling better analysis and prediction in microbiology research.
+## Requirements
+The project requires Python 3.7+ and the following packages:
+- PyTorch 1.9.0
+- Transformers 4.11.3
+- pandas 1.3.3
+- numpy 1.21.2
+- tqdm 4.62.3
+You can install the required packages using the provided `requirements.txt` file:
+```
+pip install -r requirements.txt
+```
+## File Structure
+- `config.py`: Contains all configurable parameters for the CULTURE model and training process.
+- `model.py`: Defines the CULTURE model architecture.
+- `train.py`: Script for training the CULTURE model.
+- `test.py`: Script for evaluating the CULTURE model's performance on the MLM task.
+- `get_embeddings.py`: Script for generating embeddings using the trained CULTURE model.
+- `requirements.txt`: List of required Python packages.
+- `train.csv`: Training data file containing microbial growth curves (not included in this repository).
+- `val.csv`: Validation data file containing microbial growth curves (not included in this repository).
+- `test.csv`: Test data file containing microbial growth curves (not included in this repository).
+## Usage
+1. Prepare your data:
+   - Ensure you have `train.csv`, `val.csv`, and `test.csv` files in the project directory.
+   - Each file should contain one growth curve per row, written as 128 comma-separated numeric values, with no header row.
+2. Configure the model:
+   - Open `config.py` and adjust the hyperparameters as needed.
+   - You can modify the hidden size (`hidden_size`), number of encoder layers (`num_hidden_layers`), number of attention heads (`num_attention_heads`), and masking probability (`mlm_probability`) among other parameters.
+3. Train the model:
+   ```
+   python train.py
+   ```
+   This will train the CULTURE model on your microbial growth curve data and save it as `bert_mlm_model.pth`.
+4. Evaluate the model:
+   ```
+   python test.py
+   ```
+   This will load the trained CULTURE model and evaluate its performance on the MLM task using the test data.
+5. Generate embeddings:
+   ```
+   python get_embeddings.py input_file.csv output_embeddings.npy
+   ```
+   This will load the trained CULTURE model, process the input growth curve data, and save the embeddings as a NumPy file.
+## Customization
+- To use different input dimensions (e.g., if your growth curves have a different number of time points), modify the `input_dim` parameter in `config.py`.
+- Adjust learning rate, batch size, and other training parameters in `config.py`.
+- For more advanced modifications, you can edit the model architecture in `model.py`.
+## Output
+After running `get_embeddings.py`, you'll get a NumPy file containing the hidden state embeddings for your input data. These embeddings can be used for downstream tasks.
+## Note
+CULTURE assumes that all growth curves are sampled at consistent time intervals and have been preprocessed to have the same length (128 time points). If your data differs significantly from this format, you may need to preprocess it or adjust the model architecture accordingly.

config.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+class Config:
+    """Hyperparameters, data locations and device selection for CULTURE.
+
+    Every setting is a plain class attribute, so the scripts can read it
+    from the class (``Config.vocab_size``) or from an instance
+    (``Config().device``).
+    """
+    # --- Encoder architecture (forwarded to BertConfig in model.py) ---
+    vocab_size = 30522  # size of the standard BERT vocabulary
+    hidden_size = 128  # kept equal to input_dim
+    num_hidden_layers = 6  # number of encoder layers, free to change
+    num_attention_heads = 8  # number of attention heads, free to change
+    intermediate_size = 512  # feed-forward width, chosen relative to hidden_size
+    hidden_act = "gelu"
+    hidden_dropout_prob = 0.1
+    attention_probs_dropout_prob = 0.1
+    max_position_embeddings = 512
+    type_vocab_size = 2
+    initializer_range = 0.02
+    layer_norm_eps = 1e-12
+    # --- Optimisation settings ---
+    batch_size = 32
+    learning_rate = 5e-5
+    num_train_epochs = 3
+    warmup_steps = 0
+    max_grad_norm = 1.0
+    weight_decay = 0.01
+    # --- Data files and masking ---
+    train_file = "train.csv"
+    val_file = "val.csv"
+    test_file = "test.csv"
+    input_dim = 128
+    mlm_probability = 0.15  # share of values selected for masking, user-adjustable
+    # --- Compute device: CUDA when PyTorch reports it as available, else CPU ---
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

get_embeddings.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import torch
+from torch.utils.data import DataLoader, TensorDataset
+import numpy as np
+from model import CustomBERTModel
+from config import Config
+import pandas as pd
+def load_data(file_path):
+    """Read a header-less CSV file into a float32 tensor.
+
+    Args:
+        file_path: Path of the CSV file. It is read with ``header=None``, so
+            the first row is treated as data.
+
+    Returns:
+        A ``torch.float32`` tensor holding the parsed table.
+    """
+    frame = pd.read_csv(file_path, header=None)
+    values = frame.to_numpy()
+    return torch.tensor(values, dtype=torch.float32)
+def get_embeddings(input_file, output_file):
+    """Encode the growth curves in a CSV file and save the embeddings with NumPy.
+
+    The weights are read from ``bert_mlm_model.pth`` in the current working
+    directory. The curves are passed through ``model.get_encoder_output`` in
+    batches of ``Config.batch_size`` and the concatenated result is written
+    with ``np.save``.
+
+    Args:
+        input_file: Path of the header-less CSV file with the curves.
+        output_file: Destination path; ``np.save`` appends ``.npy`` when the
+            name does not already end with it.
+    """
+    config = Config()
+    model = CustomBERTModel(config).to(config.device)
+    # map_location makes a checkpoint written on a GPU machine loadable on a
+    # CPU-only host (and the other way round).
+    state_dict = torch.load("bert_mlm_model.pth", map_location=config.device)
+    model.load_state_dict(state_dict)
+    model.eval()
+    input_data = load_data(input_file)
+    data_loader = DataLoader(TensorDataset(input_data), batch_size=config.batch_size)
+    all_embeddings = []
+    with torch.no_grad():  # inference only, no autograd bookkeeping needed
+        for batch in data_loader:
+            inputs = batch[0].to(config.device)
+            embeddings = model.get_encoder_output(inputs)
+            all_embeddings.append(embeddings.cpu().numpy())
+    all_embeddings = np.concatenate(all_embeddings, axis=0)
+    print(f"Generated embeddings shape: {all_embeddings.shape}")
+    # Save embeddings
+    np.save(output_file, all_embeddings)
+    # Report the file name np.save really used (it adds ".npy" if missing).
+    saved_path = str(output_file)
+    if not saved_path.endswith(".npy"):
+        saved_path += ".npy"
+    print(f"Embeddings saved as {saved_path}")
+if __name__ == "__main__":
+    # Command-line entry point: two positional paths, the input CSV and the
+    # file the embeddings are written to.
+    import argparse
+    cli = argparse.ArgumentParser(description="Generate embeddings for microbial growth curves")
+    cli.add_argument("input_file", help="Path to the input CSV file containing growth curves")
+    cli.add_argument("output_file", help="Path to save the output embeddings (as .npy file)")
+    options = cli.parse_args()
+    get_embeddings(options.input_file, options.output_file)

model.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import torch
+import torch.nn as nn
+from transformers import BertConfig, BertForMaskedLM
+from config import Config
+class CustomBERTModel(nn.Module):
+    """Linear input projection followed by a ``BertForMaskedLM`` network.
+
+    Inputs are projected from ``config.input_dim`` to ``config.hidden_size``
+    and handed to BERT as ``inputs_embeds``. BERT requires embeddings shaped
+    ``(batch, seq_len, hidden_size)``, so 2-D ``(batch, input_dim)`` input is
+    treated as a sequence of length one.
+    """
+    def __init__(self, config):
+        """Build the projection layer and the BERT network from ``config``."""
+        super().__init__()
+        self.input_proj = nn.Linear(config.input_dim, config.hidden_size)
+        bert_config = BertConfig(
+            vocab_size=config.vocab_size,
+            hidden_size=config.hidden_size,
+            num_hidden_layers=config.num_hidden_layers,
+            num_attention_heads=config.num_attention_heads,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            hidden_dropout_prob=config.hidden_dropout_prob,
+            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
+            max_position_embeddings=config.max_position_embeddings,
+            type_vocab_size=config.type_vocab_size,
+            initializer_range=config.initializer_range,
+            layer_norm_eps=config.layer_norm_eps
+        )
+        self.bert = BertForMaskedLM(bert_config)
+    @staticmethod
+    def _as_sequence(x):
+        """Give 2-D ``(batch, features)`` input a length-1 sequence axis."""
+        return x.unsqueeze(1) if x.dim() == 2 else x
+    def forward(self, x, labels=None):
+        """Project ``x`` and run the masked-LM network.
+
+        Args:
+            x: ``(batch, input_dim)`` or ``(batch, seq_len, input_dim)`` tensor.
+            labels: Optional labels, passed unchanged to ``BertForMaskedLM``.
+
+        Returns:
+            The ``BertForMaskedLM`` output (``.logits``, and ``.loss`` when
+            labels are given).
+        """
+        # NOTE(review): per the Hugging Face docs the masked-LM loss expects
+        # integer token ids shaped (batch, seq_len) with -100 for ignored
+        # positions; the float (batch, input_dim) labels produced by
+        # create_mlm_data in train.py/test.py do not appear to match - confirm.
+        x = self._as_sequence(self.input_proj(x))
+        return self.bert(inputs_embeds=x, labels=labels)
+    def get_encoder_output(self, x):
+        """Return the encoder's last hidden state for ``x``.
+
+        The result is ``(batch, hidden_size)`` for 2-D input and
+        ``(batch, seq_len, hidden_size)`` for 3-D input.
+        """
+        had_no_seq_axis = x.dim() == 2
+        x = self._as_sequence(self.input_proj(x))
+        hidden = self.bert.bert(inputs_embeds=x).last_hidden_state
+        # Drop the sequence axis again if it was only added internally.
+        return hidden.squeeze(1) if had_no_seq_axis else hidden

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch==1.9.0
+transformers==4.11.3
+pandas==1.3.3
+numpy==1.21.2
+tqdm==4.62.3

test.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+from torch.utils.data import DataLoader, TensorDataset
+import numpy as np
+from model import CustomBERTModel
+from config import Config
+import pandas as pd
+from tqdm import tqdm
+def load_data(file_path):
+    """Read a header-less CSV file into a float32 tensor.
+
+    Args:
+        file_path: Path of the CSV file. It is read with ``header=None``, so
+            the first row is treated as data.
+
+    Returns:
+        A ``torch.float32`` tensor holding the parsed table.
+    """
+    frame = pd.read_csv(file_path, header=None)
+    values = frame.to_numpy()
+    return torch.tensor(values, dtype=torch.float32)
+def create_mlm_data(data, mlm_probability, vocab_size=None):
+    """Build masked inputs and reconstruction labels for one batch.
+
+    Each element is selected with probability ``mlm_probability``. Of the
+    selected elements about 80% are set to 0 (the stand-in for [MASK]),
+    about 10% are replaced by a random integer below ``vocab_size`` and the
+    rest keep their value. Labels hold the original value at selected
+    positions and -100 everywhere else.
+
+    Args:
+        data: Batch tensor. It is left unmodified.
+        mlm_probability: Probability of selecting an element for masking.
+        vocab_size: Exclusive upper bound of the random replacement values;
+            defaults to ``Config.vocab_size``.
+
+    Returns:
+        ``(masked_data, labels)``, two new tensors shaped like ``data``.
+    """
+    if vocab_size is None:
+        vocab_size = Config.vocab_size
+    # Create every helper tensor on data's device so no indexing or
+    # assignment mixes CPU and CUDA tensors.
+    device = data.device
+    labels = data.clone()
+    masked_data = data.clone()  # work on a copy; the caller's batch stays intact
+    masked_indices = torch.rand(data.shape, device=device) < mlm_probability
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+    # 80% of the selected positions are replaced by the [MASK] stand-in (0).
+    indices_replaced = (torch.rand(data.shape, device=device) < 0.8) & masked_indices
+    masked_data[indices_replaced] = 0
+    # Half of the remaining 20% (10% overall) get a random value instead.
+    # NOTE(review): random token ids are written into what appear to be
+    # continuous measurements - confirm this corruption scheme is intended.
+    indices_random = (torch.rand(data.shape, device=device) < 0.5) & masked_indices & ~indices_replaced
+    random_words = torch.randint(vocab_size, data.shape, dtype=torch.long, device=device)
+    masked_data[indices_random] = random_words[indices_random].to(masked_data.dtype)
+    return masked_data, labels
+def test():
+    """Evaluate the saved model on the test set and print loss and accuracy.
+
+    The weights are read from ``bert_mlm_model.pth``. Every test batch is
+    masked with ``create_mlm_data``; the loss is averaged over batches and
+    the accuracy is computed over the masked positions only.
+    """
+    config = Config()
+    model = CustomBERTModel(config).to(config.device)
+    # map_location makes a checkpoint written on a GPU machine loadable on a
+    # CPU-only host (and the other way round).
+    state_dict = torch.load("bert_mlm_model.pth", map_location=config.device)
+    model.load_state_dict(state_dict)
+    model.eval()
+    test_data = load_data(config.test_file)
+    test_dataset = TensorDataset(test_data)
+    test_loader = DataLoader(test_dataset, batch_size=config.batch_size)
+    total_loss = 0
+    total_correct = 0
+    total_predictions = 0
+    with torch.no_grad():  # evaluation only, no gradients needed
+        for batch in tqdm(test_loader, desc="Testing"):
+            inputs = batch[0].to(config.device)
+            masked_inputs, labels = create_mlm_data(inputs, config.mlm_probability)
+            outputs = model(masked_inputs, labels=labels)
+            loss = outputs.loss
+            total_loss += loss.item()
+            predictions = outputs.logits.argmax(dim=-1)
+            mask = labels != -100  # score masked positions only
+            total_correct += (predictions[mask] == labels[mask]).sum().item()
+            total_predictions += mask.sum().item()
+    avg_loss = total_loss / len(test_loader) if len(test_loader) else float("nan")
+    # Random masking can select nothing on a tiny test set; accuracy is then
+    # undefined, so report NaN instead of dividing by zero.
+    accuracy = total_correct / total_predictions if total_predictions else float("nan")
+    print(f"Test Loss: {avg_loss:.4f}")
+    print(f"Test Accuracy: {accuracy:.4f}")
+# Run the evaluation only when this file is executed as a script.
+if __name__ == "__main__":
+    test()

train.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+import numpy as np
+from model import CustomBERTModel
+from config import Config
+import pandas as pd
+from tqdm import tqdm
+def load_data(file_path):
+    """Read a header-less CSV file into a float32 tensor.
+
+    Args:
+        file_path: Path of the CSV file. It is read with ``header=None``, so
+            the first row is treated as data.
+
+    Returns:
+        A ``torch.float32`` tensor holding the parsed table.
+    """
+    frame = pd.read_csv(file_path, header=None)
+    values = frame.to_numpy()
+    return torch.tensor(values, dtype=torch.float32)
+def create_mlm_data(data, mlm_probability, vocab_size=None):
+    """Build masked inputs and reconstruction labels for one batch.
+
+    Each element is selected with probability ``mlm_probability``. Of the
+    selected elements about 80% are set to 0 (the stand-in for [MASK]),
+    about 10% are replaced by a random integer below ``vocab_size`` and the
+    rest keep their value. Labels hold the original value at selected
+    positions and -100 everywhere else.
+
+    Args:
+        data: Batch tensor. It is left unmodified.
+        mlm_probability: Probability of selecting an element for masking.
+        vocab_size: Exclusive upper bound of the random replacement values;
+            defaults to ``Config.vocab_size``.
+
+    Returns:
+        ``(masked_data, labels)``, two new tensors shaped like ``data``.
+    """
+    if vocab_size is None:
+        vocab_size = Config.vocab_size
+    # Create every helper tensor on data's device so no indexing or
+    # assignment mixes CPU and CUDA tensors.
+    device = data.device
+    labels = data.clone()
+    masked_data = data.clone()  # work on a copy; the caller's batch stays intact
+    masked_indices = torch.rand(data.shape, device=device) < mlm_probability
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+    # 80% of the selected positions are replaced by the [MASK] stand-in (0).
+    indices_replaced = (torch.rand(data.shape, device=device) < 0.8) & masked_indices
+    masked_data[indices_replaced] = 0
+    # Half of the remaining 20% (10% overall) get a random value instead.
+    # NOTE(review): random token ids are written into what appear to be
+    # continuous measurements - confirm this corruption scheme is intended.
+    indices_random = (torch.rand(data.shape, device=device) < 0.5) & masked_indices & ~indices_replaced
+    random_words = torch.randint(vocab_size, data.shape, dtype=torch.long, device=device)
+    masked_data[indices_random] = random_words[indices_random].to(masked_data.dtype)
+    return masked_data, labels
+def train():
+    """Train the model on the masked-reconstruction task and save its weights.
+
+    Runs ``Config.num_train_epochs`` epochs over ``Config.train_file``,
+    prints the average training loss and the average loss on
+    ``Config.val_file`` after every epoch, and finally writes the state dict
+    to ``bert_mlm_model.pth``.
+    """
+    config = Config()
+    model = CustomBERTModel(config).to(config.device)
+    optimizer = optim.AdamW(
+        model.parameters(),
+        lr=config.learning_rate,
+        weight_decay=config.weight_decay,
+    )
+    train_loader = DataLoader(
+        TensorDataset(load_data(config.train_file)),
+        batch_size=config.batch_size,
+        shuffle=True,
+    )
+    val_loader = DataLoader(
+        TensorDataset(load_data(config.val_file)),
+        batch_size=config.batch_size,
+    )
+    for epoch in range(config.num_train_epochs):
+        # Optimisation pass over the shuffled training set.
+        model.train()
+        batch_losses = []
+        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_train_epochs}")
+        for (curves,) in progress:
+            curves = curves.to(config.device)
+            masked_inputs, labels = create_mlm_data(curves, config.mlm_probability)
+            optimizer.zero_grad()
+            loss = model(masked_inputs, labels=labels).loss
+            loss.backward()
+            # Clip the global gradient norm before the parameter update.
+            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
+            optimizer.step()
+            batch_losses.append(loss.item())
+        avg_train_loss = sum(batch_losses) / len(train_loader)
+        print(f"Epoch {epoch+1}/{config.num_train_epochs}, Average training loss: {avg_train_loss:.4f}")
+        # Gradient-free pass over the validation set, masked the same way.
+        model.eval()
+        val_losses = []
+        with torch.no_grad():
+            for (curves,) in val_loader:
+                curves = curves.to(config.device)
+                masked_inputs, labels = create_mlm_data(curves, config.mlm_probability)
+                val_losses.append(model(masked_inputs, labels=labels).loss.item())
+        avg_val_loss = sum(val_losses) / len(val_loader)
+        print(f"Validation loss: {avg_val_loss:.4f}")
+    # Persist the weights only; the evaluation and embedding scripts load this file.
+    torch.save(model.state_dict(), "bert_mlm_model.pth")
+    print("Model saved as bert_mlm_model.pth")
+# Start training only when this file is executed as a script.
+if __name__ == "__main__":
+    train()