client-send-image-script/send-image.py at main · context-labs/client-send-image-script · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import json
import os
from openai import OpenAI
from argparse import ArgumentParser
from typing import Optional
import base64
import requests
import mimetypes

# Note: set INFERENCE_API_KEY to your inference API key
# You can try:
# > python send-image.py --image_url "https://drive.usercontent.google.com/u/0/uc?id=1y5bEj43a01BPF1-hPIqdV5XXHcpdjCve&export=download"
# or:
# > python send-image.py --image_filepath "./test.jpg"

def parse_args():
    """Parse and validate the command-line arguments.

    Returns:
        The parsed ``argparse.Namespace`` with ``image_url``,
        ``image_filepath``, ``temperature``, ``model`` and ``api_key``.

    Exits with the standard argparse usage error (status 2) when neither
    an image URL nor an image file path is given, or when no non-blank API
    key is available from ``--api_key`` or ``INFERENCE_API_KEY``.
    """
    parser = ArgumentParser()
    parser.add_argument("--image_url", type=str, required=False)
    parser.add_argument("--image_filepath", type=str, required=False)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--model", type=str, default="google/gemma-3-27b-instruct/bf-16")
    parser.add_argument("--api_key", type=str, required=False, default=os.getenv("INFERENCE_API_KEY"))
    args = parser.parse_args()

    # parser.error is used instead of assert: asserts are stripped under
    # `python -O`, and a missing key is None, which has no .strip().
    if not (args.image_url or args.image_filepath):
        parser.error("Must provide either image_url or image_filepath")
    if not (args.api_key and args.api_key.strip()):
        parser.error("Must provide --api_key or define INFERENCE_API_KEY environment variable")

    return args


def main():
    """Script entry point: parse the CLI arguments, encode the image, send it."""
    cli_args = parse_args()
    send_image_request(
        cli_args,
        create_data_uri(cli_args.image_url, cli_args.image_filepath),
    )


def create_data_uri(image_url : Optional[str], image_filepath : Optional[str]):
    """Build a base64 ``data:`` URI for the image.

    The URL is used when it is truthy; otherwise the file path is used.

    Args:
        image_url: Remote location of the image, or None/empty.
        image_filepath: Local path of the image, or None/empty.

    Returns:
        A string of the form ``data:<mimetype>;base64,<payload>``.

    Raises:
        ValueError: If both arguments are falsy, or if one of the
            validation helpers rejects the image size or mimetype.
    """
    if not image_url and not image_filepath:
        raise ValueError("Must provide either image_url or image_filepath")

    # Pick the loader matching the source; the URL takes precedence.
    if image_url:
        loader, source = get_image_bytes_from_image_url, image_url
    else:
        loader, source = get_image_bytes_from_image_filepath, image_filepath

    raw_bytes, mimetype = loader(source)
    validate_image_bytes(raw_bytes)
    validate_image_mimetype(mimetype)

    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    return f"data:{mimetype};base64,{encoded}"

def get_image_bytes_from_image_url(image_url : str, timeout : float = 30.0):
    """Download an image and return ``(content_bytes, mimetype)``.

    Args:
        image_url: URL to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block indefinitely.

    Returns:
        A tuple of the response body bytes and the bare media type taken
        from the ``Content-Type`` header (parameters such as
        ``; charset=...`` removed, lower-cased), or None for the media
        type when the server sent no ``Content-Type`` header.

    Raises:
        requests.HTTPError: If the response status signals an error.
    """
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()

    content_type = response.headers.get("Content-Type")
    # Keep only the type/subtype so a header like "image/jpeg; charset=binary"
    # still compares equal to "image/jpeg".
    mimetype = content_type.split(";", 1)[0].strip().lower() if content_type else None
    return response.content, mimetype

def get_image_bytes_from_image_filepath(image_filepath : str):
    """Read a local image file and return ``(file_bytes, mimetype)``.

    The mimetype comes from ``get_mimetype_from_filepath`` and is based on
    the path, not on the file contents.
    """
    with open(image_filepath, "rb") as image_file:
        image_bytes = image_file.read()
    return image_bytes, get_mimetype_from_filepath(image_filepath)

def get_mimetype_from_filepath(filepath : str):
    """Return the mimetype guessed from the path, or None if unknown."""
    guessed_type, _encoding = mimetypes.guess_type(filepath)
    return guessed_type

def validate_image_bytes(image_bytes):
    """Raise ValueError when the image is larger than 1024 * 1024 bytes."""
    size_limit = 1024 * 1024
    if len(image_bytes) <= size_limit:
        return
    raise ValueError("Image must be less than 1MB")

def validate_image_mimetype(image_mimetype):
    """Raise ValueError unless the mimetype is one of the accepted image types.

    Accepted: image/png, image/jpeg, image/jpg, image/gif, image/webp.
    """
    valid_mimetypes = [f"image/{subtype}" for subtype in ("png", "jpeg", "jpg", "gif", "webp")]
    if image_mimetype in valid_mimetypes:
        return
    raise ValueError(f"Invalid image mimetype: {image_mimetype}, must be one of {valid_mimetypes}")

def send_image_request(args, data_uri):
    """Send the image to the chat-completions endpoint and print the reply.

    Args:
        args: Parsed CLI namespace; ``api_key``, ``model`` and
            ``temperature`` are read from it.
        data_uri: ``data:`` URI of the image, passed as the ``image_url``
            part of the user message.

    The response object is printed as the dict returned by ``model_dump()``.
    """
    # Client for the OpenAI-compatible API served at api.inference.net.
    client = OpenAI(base_url="https://api.inference.net/v1", api_key=args.api_key)

    system_message = """You are a JSON-only image analysis API specializing in YouTube video keyframe analysis. You must respond with a valid JSON object exactly matching this structure:

    {
        "description": "2-3 sentences describing what is happening in this video keyframe",
        "objects": ["object1 with details", "object2 with details", ...],
        "actions": ["action1 with details", "action2 with details", ...],
        "environment": "detailed description of setting and atmosphere",
        "content_type": "Main type of content (e.g., real-world footage, video game, animation, cartoon, CGI, etc.)",
        "specific_style": "Specific visual or content style (e.g., anime, 3D animation, mobile gameplay, vlog, tutorial, news, documentary, etc.)",
        "production_quality": "Quality and production value (e.g., professional studio, amateur, high-budget, webcam, etc.)",
        "summary": "one comprehensive sentence summary"
    }

    Rules:
    - Remember this is a keyframe from a YouTube video, so focus on what's happening in this moment
    - ONLY output valid JSON, no other text
    - Maximum 10 objects
    - Maximum 5 actions
    - Keep descriptions concise but detailed
    - Be specific and consistent in categorizing content_type, specific_style, and production_quality
    - Ensure proper JSON escaping for quotes"""

    # The user turn carries the instruction text followed by the image itself.
    user_content = [
        {
            "type": "text",
            "text": "Return a JSON object analyzing this image. Follow the exact format specified. Do not include any text outside the JSON object.",
        },
        {"type": "image_url", "image_url": {"url": data_uri}},
    ]
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_content},
    ]

    # Single non-streaming request; max_tokens caps the model's text
    # response only (not related to image tokens).
    completion = client.chat.completions.create(
        model=args.model,
        messages=conversation,
        stream=False,
        max_tokens=3000,
        temperature=args.temperature,
        response_format={"type": "json_object"},
    )

    print(completion.model_dump())

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()