OpenAI-compatible vision model support (vLLM)
I use llama-swap (in Docker) as an OpenAI-compatible server instead of Ollama. Vision support is provided by vLLM, itself running in Docker (docker-in-docker: the llama-swap container talks to the host's Docker daemon through a mounted socket and starts vLLM as a sibling container). Here is an example config:
# vllm via docker
"gpt-4-vision": #"qwen2-vl-7B-gptq-int8":
  aliases:
    - gpt-4-vision
  proxy: "http://172.17.0.1:9797"
  checkEndpoint: "/health"
  cmd_stop: docker stop qwen2vl
  cmd: >
    docker run --init --rm --runtime=nvidia --name qwen2vl
    --gpus '"device=0"'
    -v /root/projects/models:/models
    -p 9797:8000
    vllm/vllm-openai:v0.7.0
    --host 0.0.0.0
    --model "/models/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8"
    --served-model-name gpt-4-vision
    --disable-log-stats
    --enforce-eager
    --trust-remote-code
    --gpu-memory-utilization 0.85
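Before wiring up clients, you can sanity-check the proxy by asking llama-swap for its model list. A minimal sketch, assuming llama-swap listens on localhost:8080 as in the compose file below and that the requests package is installed:

import requests

BASE = "http://localhost:8080"  # assumed llama-swap address; adjust for your host

# llama-swap serves the OpenAI-compatible /v1/models route, listing the
# models it manages; the model configured above should appear here.
models = requests.get(f"{BASE}/v1/models", timeout=10).json()
print([m["id"] for m in models.get("data", [])])  # expect "gpt-4-vision"

The first real chat request then triggers llama-swap to run the cmd above and wait on checkEndpoint before proxying.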
docker-entrypoint.sh
#!/bin/bash
set -e

# Install Docker CLI if not already installed
if ! command -v docker &> /dev/null; then
    echo "Installing Docker CLI..."
    apt-get update
    apt-get install -y apt-transport-https ca-certificates curl gnupg lsb-release
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
    echo 'deb [arch=amd64] https://download.docker.com/linux/ubuntu jammy stable' > /etc/apt/sources.list.d/docker.list
    apt-get update
    apt-get install -y docker-ce-cli
    rm -rf /var/lib/apt/lists/*
    echo "Docker CLI installed successfully."
else
    echo "Docker CLI already installed."
fi

# Log information about Docker and the environment
echo "Docker version:"
docker --version

# Check if the specified vLLM image exists
if ! docker image inspect vllm/vllm-openai:v0.7.0 &> /dev/null; then
    echo "Pulling vLLM image vllm/vllm-openai:v0.7.0..."
    docker pull vllm/vllm-openai:v0.7.0
    echo "vLLM image pulled successfully."
else
    echo "vLLM image already exists."
fi

# Execute the original entrypoint with provided arguments
echo "Starting llama-swap with provided arguments: $@"
exec /app/llama-swap "$@"
compose.yaml
version: '3.8'

services:
  llama-swap:
    image: ghcr.io/mostlygeek/llama-swap:cuda
    container_name: llama-swap
    restart: unless-stopped
    ports:
      - "8080:8080" # llama-swap default port
    volumes:
      - ./config.yaml:/app/config.yaml
      - ./docker-entrypoint.sh:/docker-entrypoint.sh
      - models_volume:/models
      - /var/run/docker.sock:/var/run/docker.sock # Docker socket mount, so llama-swap can start the vLLM container
    entrypoint: [ "/docker-entrypoint.sh", "-config", "/app/config.yaml" ]
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:8080/" ]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [ gpu ]

volumes:
  models_volume:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /root/projects/models
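The /var/run/docker.sock mount is what makes the docker-in-docker setup work: llama-swap's cmd talks to the host's Docker daemon and starts vLLM as a sibling container. A quick check that the socket is usable from inside the container, run from the host (a sketch; assumes the container name llama-swap from the compose file):

import subprocess

# Run the Docker CLI inside the llama-swap container against the mounted
# socket; if the mount is missing or unreadable, this raises CalledProcessError.
out = subprocess.run(
    ["docker", "exec", "llama-swap", "docker", "info", "--format", "{{.ServerVersion}}"],
    capture_output=True, text=True, check=True,
)
print("Host Docker daemon version seen from inside llama-swap:", out.stdout.strip())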
Dockerfile
FROM ghcr.io/mostlygeek/llama-swap:cuda

# Install Docker CLI
RUN apt-get update && \
    apt-get install -y apt-transport-https ca-certificates curl gnupg lsb-release && \
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \
    echo 'deb [arch=amd64] https://download.docker.com/linux/ubuntu jammy stable' > /etc/apt/sources.list.d/docker.list && \
    apt-get update && \
    apt-get install -y docker-ce-cli && \
    rm -rf /var/lib/apt/lists/*

# Use the same entrypoint and command as the base image
ENTRYPOINT ["/app/llama-swap"]
CMD ["-config", "/app/config.yaml"]
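Note: this Dockerfile bakes the Docker CLI into a custom image as an alternative to installing it at startup via the mounted docker-entrypoint.sh. The build script below tags it custom-llama-swap; to actually run that image, point image: in compose.yaml at the custom tag and drop the entrypoint override.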
build_and_start.sh
#!/bin/bash
set -e
echo "Building custom llama-swap image with Docker CLI..."
docker build -t custom-llama-swap .
echo "Stopping any existing llama-swap container..."
docker stop llama-swap || true
docker rm llama-swap || true
echo "Starting llama-swap container..."
docker compose up -d
echo "Container is starting. You can check logs with:"
echo "docker logs -f llama-swap"
test_vision_api.py
import requests
import json
import time
import base64

# Server configuration
url = "http://llm:8080/v1/chat/completions"
model = "gpt-4-vision"
headers = {
    "Content-Type": "application/json",
}

# Check if the server is up
try:
    health_check = requests.get("http://llm:8080/")
    print(f"Server health check status: {health_check.status_code}")
    print(f"Server response: {health_check.text[:200]}...")  # Print first 200 chars
except Exception as e:
    print(f"Server health check failed: {e}")

# Sample images for testing
# 1. HTTP URL with proper protocol
http_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

# 2. Function to create a base64 image
def create_sample_base64_image():
    """
    Create a sample image for testing.
    Returns a data URI with a base64-encoded original image.
    """
    # Download the original image from http_image_url and convert it to base64
    response = requests.get(http_image_url)
    response.raise_for_status()
    image_data = response.content
    encoded_data = base64.b64encode(image_data).decode('utf-8')
    return f"data:image/jpeg;base64,{encoded_data}"

# Create a base64 sample image
base64_image = create_sample_base64_image()

# Test 1: HTTP URL request
http_payload = {
    "model": model,
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image? (HTTP URL test)"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": http_image_url
                    }
                }
            ]
        }
    ],
    "max_tokens": 300
}

# Test 2: Base64 image request
base64_payload = {
    "model": model,
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image? (Base64 test)"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": base64_image
                    }
                }
            ]
        }
    ],
    "max_tokens": 300
}

def make_request(payload, test_name):
    """Make a request to the server with the given payload and test name."""
    print(f"\n\n===== Testing {test_name} =====")
    try:
        print(f"Sending request to: {url}")
        print(f"Payload includes image of type: {payload['messages'][0]['content'][1]['type']}")
        # For base64 images, don't log the entire image data
        if "image_url" in payload['messages'][0]['content'][1] and payload['messages'][0]['content'][1]['image_url']['url'].startswith('data:'):
            print("Using base64 encoded image")
        else:
            print(f"Using image URL: {payload['messages'][0]['content'][1]['image_url']['url']}")
        start_time = time.time()
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        end_time = time.time()
        print(f"Response time: {end_time - start_time:.2f} seconds")
        print(f"Response status: {response.status_code}")
        print(f"Response headers: {response.headers}")
        try:
            response_json = response.json()
            print("\nResponse JSON:")
            print(json.dumps(response_json, indent=2))
            if "choices" in response_json and len(response_json["choices"]) > 0:
                content = response_json["choices"][0]["message"]["content"]
                print("\nGenerated description:")
                print(content)
        except json.JSONDecodeError:
            print("\nNon-JSON response:")
            print(response.text)
        return response.status_code
    except Exception as e:
        print(f"Error making request: {e}")
        # Note: "is not None" matters here; an error Response is falsy
        if hasattr(e, 'response') and e.response is not None:
            print(f"Response text: {e.response.text}")
        return None

# Run the tests
print("\n============= STARTING TESTS =============\n")

# Test HTTP URL
http_status = make_request(http_payload, "HTTP URL Image")

# Wait before trying the next test
time.sleep(2)

# Test Base64 Image
base64_status = make_request(base64_payload, "Base64 Encoded Image")

# Print summary
print("\n============= TEST SUMMARY =============")
print(f"HTTP URL test status: {http_status}")
print(f"Base64 image test status: {base64_status}")
print("=======================================\n")
Output:

Generated description: The image depicts a serene landscape featuring a wooden boardwalk that stretches through a field of tall green grass. The boardwalk appears to be in good condition, leading the viewer's eye towards a distance where a blue sky with scattered clouds can be seen. The background includes smaller patches of greenery and a sense of openness, conveying a peaceful and natural environment. The overall scene is well-lit, suggesting it is taken during the day with natural sunlight.
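The same endpoint also works with the official OpenAI Python SDK, which is what clients like OpenWebUI effectively speak. A short sketch, assuming openai>=1.0 is installed and the same http://llm:8080 host as in the test script (the API key is a dummy value, since nothing here checks it):

from openai import OpenAI

client = OpenAI(base_url="http://llm:8080/v1", api_key="sk-local")  # dummy key

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

resp = client.chat.completions.create(
    model="gpt-4-vision",
    max_tokens=300,
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }],
)
print(resp.choices[0].message.content)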
How it looks in OpenWebUI:
At first glance this looks amazing! I ran out of time today, but tomorrow I'll hop in and finish reviewing/testing. I really appreciate you adding this!