Support Phi-3-mini-128k
🚀 Model / language coverage
Support the https://huggingface.co/microsoft/Phi-3-mini-128k-instruct model. This is a tracking issue.
Dynamo is splitting this into 13 subgraphs. The good news is that `examine` doesn't find much for us (a reproduction sketch follows this list):
- `_exit_autocast` of `torch.amp.autocast_mode`
- `_enter_autocast` of `torch.amp.autocast_mode`
- and `examine` notes that `TensorBase.bool` is automatically registered, which is probably fine.
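For reference, here is a minimal sketch of how `examine` can be pointed at the model. It assumes `thunder.examine.examine` accepts a callable plus sample inputs and reports the operations Thunder cannot yet handle; the tiny prompt is arbitrary.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from thunder.examine import examine

# Sketch: run Thunder's examine pass on the HF model with a small input.
# Assumption: examine(fn, *args, **kwargs) traces the call and reports
# any operations Thunder does not yet support.
model_name = "microsoft/Phi-3-mini-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True, torch_dtype='auto'
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
batch = tokenizer("hello world", return_tensors="pt")
examine(model, input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
```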
Issues:
- #1276
- https://github.com/NVIDIA/Fuser/issues/3228
Pitch
This is an ask from internal NVIDIA colleagues.
Minimal Repro
```python
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from datasets import load_dataset

import thunder
import thunder.dynamo

# Define model and tokenizer
model_name = "microsoft/Phi-3-mini-128k-instruct"  # Replace with the model you want to fine-tune
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype='auto'
)
model = torch.compile(model, dynamic=False,
                      backend=thunder.dynamo.ThunderCompiler())
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token to the tokenizer
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize the model's embeddings to accommodate new tokens

# Load a smaller dataset
dataset = load_dataset("tiny_shakespeare", split='train')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True,
                     max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Convert the dataset to PyTorch format and specify columns to return as tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Create PyTorch DataLoader
dataloader = DataLoader(tokenized_dataset, batch_size=1, shuffle=False)

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(dataloader),
)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Fine-tuning loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        # Move input tensors to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass (labels=input_ids gives the standard causal-LM loss)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        print(loss)
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update learning rate
        lr_scheduler.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} completed. Average Loss: {avg_loss:.4f}")

# Report peak GPU memory usage (peak_bytes avoids shadowing the builtin `bytes`)
peak_bytes = torch.cuda.memory.max_memory_allocated()
mib: int = 1024 * 1024
gib: int = 1024 * 1024 * 1024
print(f"max allocated: {peak_bytes/1024}KiB {peak_bytes/mib}MiB {peak_bytes/gib}GiB")
```
cc @tfogal
LitGPT supports Phi-3-mini-4k-instruct. It should be easy to add the 128k configuration.
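For the LitGPT route, here is a rough sketch of deriving a 128k configuration from the existing 4k one, assuming `litgpt.Config.from_name` accepts keyword overrides. Note it is incomplete: the 128k checkpoint also uses LongRoPE-style rope scaling, which a plain `block_size` override does not capture.

```python
from litgpt import Config

# Sketch: start from the known 4k config and override the context length.
# Assumption: Config.from_name forwards keyword overrides onto the config;
# the real 128k variant additionally changes rope scaling (LongRoPE).
config = Config.from_name("Phi-3-mini-4k-instruct", block_size=131072)
print(config.block_size)  # 131072
```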
Is the request to support HuggingFace's Transformers implementation or any implementation of this Phi 3 model?
> Is the request to support HuggingFace's Transformers implementation or any implementation of this Phi 3 model?

HuggingFace's Transformers implementation.