-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathinfer_multishot_with_recaption_example.py
More file actions
164 lines (135 loc) · 10.2 KB
/
infer_multishot_with_recaption_example.py
File metadata and controls
164 lines (135 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import torch
from diffsynth.utils.data import save_video
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
import pandas as pd
import ast
import json
import os
import argparse
from util import rgb_to_latent_shot_groups_list, pad_shot_groups_to_4n_plus_1, get_user_wanted_frames, save_video_with_caption
from openai import OpenAI
def parse_json(json_output: str) -> str:
    """Strip a Markdown ```json code fence from an LLM response, if present.

    Returns the text between the opening ```json line and the next closing
    ``` fence. If no ```json fence is found, the input is returned unchanged.

    Args:
        json_output: Raw model output, possibly wrapped in a Markdown fence.

    Returns:
        The fenced JSON payload (or the original string when unfenced).
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        # FIX: compare on the stripped line — models frequently indent the
        # fence or leave trailing whitespace, which strict equality missed.
        if line.strip() == "```json":
            json_output = "\n".join(lines[i + 1:])   # drop everything up to and including the fence
            json_output = json_output.split("```")[0]  # drop the closing fence and any trailer
            break  # only the first ```json fence is honored
    return json_output
if __name__ == "__main__":
    # --- CLI arguments ------------------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path_json", type=str, default=None)
    parser.add_argument("--output_name", type=str, default="1.3b_recaption")
    parser.add_argument("--target_width", type=int, default=832)
    parser.add_argument("--target_height", type=int, default=480)
    # FIX: type=bool is a known argparse pitfall — bool("False") is True, so
    # `--use_usp False` would silently enable USP. Parse the string explicitly;
    # the flag still accepts the same `--use_usp True/False` spelling.
    parser.add_argument(
        "--use_usp",
        type=lambda s: str(s).strip().lower() in ("1", "true", "yes", "y"),
        default=False,
    )
    args = parser.parse_args()

    seed = 0
    # Standard Wan negative prompt (Chinese): discourages over-exposure, blur,
    # artifacts, deformed anatomy, static frames, cluttered backgrounds, etc.
    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"

    # --- OpenAI-compatible client for the Gemini recaption call -------------
    # TODO: replace the placeholders with real credentials before running.
    client = OpenAI(
        api_key="your_api_key",
        base_url="your_api_url"
    )

    # --- Load model weight paths and build the video pipeline ---------------
    load_path_json = args.model_path_json
    with open(load_path_json) as user_file:
        model_paths = json.load(user_file)
    pipe = WanVideoPipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        use_usp=args.use_usp,  # whether to use USP (sequence parallelism) for inference
        model_configs=[
            ModelConfig(path=model_paths["dit"]),
            ModelConfig(path=model_paths["t5"]),
            ModelConfig(path=model_paths["vae"]),
        ],
        tokenizer_config=ModelConfig(path=model_paths["tokenizer"]),
    )
    output_dir = f"output/{args.output_name}"
    print(f"output_dir: {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    # --- User's customized multi-shot prompt and shot boundaries ------------
    user_input = "Global scene: Medieval Castle Magic Ritual. Global style: Moonlit magic, medieval gothic. Shot 1: The wizard stands in the moonlit castle courtyard, surrounded by ancient oil lamps. Shot 2: The wizard raises the staff, its crystal tip emitting blue light. Shot 3: The crystal's light intensifies, forming a complex magical circle. Shot 4: The wizard plants the staff, cracks spreading across the ground. Shot 5: An energy vortex forms above as the staff illuminates the entire courtyard."
    # [start_frame, end_frame) pairs per shot, in RGB frame indices.
    shot_groups = [[0, 57], [57, 110], [110, 171], [171, 232], [232, 293]]
    # Pad each shot to a 4n+1 frame count (Wan VAE temporal constraint — see
    # util helpers), then map RGB frame ranges into latent-space ranges.
    padded_shot_groups, save_shot_num_list = pad_shot_groups_to_4n_plus_1(shot_groups)
    latent_shot_groups = rgb_to_latent_shot_groups_list(padded_shot_groups)

    # --- Recaption prompt: asks the LLM for a global + per-shot caption dict -
    global_prompt = f"""### Task Overview:
Your task is to read and understand the provided multi-shot caption, and rewrite new multi-shot captions in the specified output format. The provided multi-shot caption is '{user_input}'.
### Special Notes for global caption:
1. You need to first describe a global caption that describe each subject's appearance and describe the video scene and style roughly in a single paragraph.
2. Each subject's caption starts with "Subject X:", where X \in [1,2,3,...].
3. Use no more than 20 words per description for each subject and describe only appearance in global caption.
4. People, vehicles, animals, motor vehicles, food, and other independent objects are subjects that can be described.
5. If there is no explicit indication, there is only one Subject in the scene.
6. Do not describe sound.
### Special Notes for per-shot captions:
1. You need to describe each shot's caption that includes the subjects' expression and actions, scene background, and camera movement in a single paragraph.
2. Analyze which subjects in the global caption are present in the current shot. And use only existing subject numbers in story setting (e.g., 'Subject 2') to denote the visible subject. Some subjects in the story may not appear in the video.
3. First describe the subject facing the camera, then describe the other subjects. When describing the camera position, specify which subjects are facing to the camera.
4. Do not repeat the content in the story setting.
5. The camera positions are different between different shots.
6. Do not describe sound.
### Expected Output Format (Example 1):
{{
"global_caption": "Subject 1: A woman with shoulder-length brown hair wears a red, white, and black horizontally striped shirt and a dark skirt. Subject 2: A fluffy orange tabby cat with bright amber eyes, pink nose, and distinctive dark stripes across its round, chubby face. The whole scene takes place in a retro-style kitchen with patterned wallpaper and white cabinets. The visual style is modern television production, featuring clear imagery and naturalistic representation.",
"shot0": "Subject 1 walks into the kitchen from the left, holding plates. She places a plate on the counter by the sink, then picks up food from it with a focused expression. The retro-style kitchen background includes a stove with pots, a sink with dishes, and patterned wallpaper. The camera is static, capturing Subject 1 from a side angle as she performs her actions.",
"shot1": "Subject 1 is standing and moving slightly in a retro-style kitchen, holding a plate and appearing to be washing or handling dishes near the sink. She then moves towards a counter by the window, placing an item down. The background features patterned wallpaper, white cabinets, and a window revealing an outdoor fire escape. The camera remains static, providing a medium shot of Subject 1 and the kitchen.",
"shot2": "Subject 1 is looking down with a focused and slightly smile expression. The background reveals a retro-style kitchen featuring white cabinets and patterned wallpaper. The camera remains static, capturing Subject 1 in a medium close-up shot.",
}}
### Expected Output Format (Example 2):
{{
"global_caption": "Subject 1: A metallic suit of red and gold armor with glowing blue eyes and a bright arc reactor on the chest. The whole scene takes place on a coastal highland at sunset, with shimmering waves reflecting the golden light and a gentle ocean breeze. The visual style is modern television production, featuring realistic textures and cinematic lighting.",
"shot0": "Subject 1 stands on the coastal highland with its back to the camera, admiring the setting sun. The silhouette is captured against the warm golden sky as waves crash gently below. The camera shoots from behind Subject 1, creating a contemplative and majestic atmosphere with the vast ocean horizon stretching into the distance.",
"shot1": "Subject 1 stands in the center of the frame looking to the left. The shot captures the side profile of the upper body, clearly showing the detailed metallic surface of the iron exterior with intricate panel lines and the glowing arc reactor. The camera captures from a side angle, emphasizing the sophisticated armor design against the sunset backdrop.",
"shot2": "A close-up shot of Subject 1's face as it looks downward with a contemplative expression. The camera remains stationary, capturing high-definition facial details including the glowing blue eyes and the refined metallic features of the helmet, with warm sunset light creating dramatic shadows across the armor's surface.",
"shot3": "A close-up shot of Subject 1's hand from an overhead camera angle, capturing the subtle details of fingers slightly opening. The metallic texture and articulated joints of the armored glove are clearly visible, with the golden hour lighting highlighting the intricate mechanical craftsmanship."
}}
"""

    # --- Recaption via the LLM ----------------------------------------------
    response = client.chat.completions.create(
        model="gemini-2.5-flash",
        messages=[
            {
                "role": "system",
                "content": "You are an multi-shot story understanding and rewrite expert that only outputs video captions."
            },
            {"role":"user","content": global_prompt},
        ],
        temperature=0,  # deterministic rewrite
        timeout=120.0,
    )
    # Best-effort cleanup of the model output before parsing.
    # NOTE(review): the second .replace collapses a space with a space as
    # written — presumably intended to collapse double spaces; verify upstream.
    multishot_caption_str = response.choices[0].message.content.replace('\n', '').replace(' ', ' ').strip('`').replace('json', '', 1)
    # FIX: `assert False` is stripped under `python -O`; raise explicitly so
    # an empty/truncated LLM response always fails loudly.
    if len(multishot_caption_str) < 5:
        raise RuntimeError("rewrite caption bug")
    # ast.literal_eval (not json.loads) is deliberate: it tolerates trailing
    # commas, which the prompt's Example 1 itself contains.
    caption_dict = ast.literal_eval(parse_json(multishot_caption_str))
    num_shots = len(latent_shot_groups)

    # --- Assemble per-shot prompts: shared story prefix + shot caption ------
    global_caption = f"Story: {caption_dict['global_caption']} "
    now_multishot_video_caption_list = []
    for count in range(num_shots):
        now_shot_caption = global_caption + f"Now: {caption_dict[f'shot{count}']}"
        now_multishot_video_caption_list.append(now_shot_caption)
    print(now_multishot_video_caption_list)
    multishot_negative_prompt = [negative_prompt] * num_shots

    # --- Text-to-video generation -------------------------------------------
    video = pipe(
        width=args.target_width,
        height=args.target_height,
        prompt=now_multishot_video_caption_list,
        negative_prompt=multishot_negative_prompt,
        seed=seed, tiled=True,
        num_frames=latent_shot_groups[-1][-1],  # total latent frame count
        shot_groups=shot_groups,
        latent_shot_groups=latent_shot_groups,
    )
    # Drop padding frames, keeping only the user-requested shot spans.
    user_wanted_frames = get_user_wanted_frames(video, padded_shot_groups, save_shot_num_list)
    # Save the video without captions burned in...
    save_video(user_wanted_frames, f"{output_dir}/1.mp4", fps=15, quality=5)
    # ...and a second copy with per-shot captions rendered onto it.
    save_video_with_caption(num_shots, shot_groups, now_multishot_video_caption_list, user_wanted_frames, f"{output_dir}/1_with_caption.mp4", args.target_width)
    print("Enjoy the story")
    torch.cuda.empty_cache()