Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 60 additions & 11 deletions code/download_checkpoints.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import argparse
import os
# os.environ["HF_ENDPOINT"] = 'https://hf-mirror.com'
from huggingface_hub import hf_hub_download, snapshot_download
from huggingface_hub import hf_hub_download, snapshot_download, login

# Set HF_TOKEN environment variable before running this script, e.g.:
# export HF_TOKEN=hf_...

def download_ckpt(local_dir, repo_id, filename):
os.makedirs(local_dir, exist_ok=True)
Expand All @@ -15,15 +18,61 @@ def download_ckpt(local_dir, repo_id, filename):
print(f"File has been downloaded to: {file_path}")
else:
print(f"File exists already: {local_path}")


def download_snapshot(local_dir, repo_id):
    """Download the whole repository *repo_id* into *local_dir* unless it is already there.

    Args:
        local_dir: Destination directory; created if it does not exist.
        repo_id: Hugging Face repository id passed to ``snapshot_download``.

    The download is skipped when *local_dir* is an existing, non-empty directory.
    """
    # NOTE(review): any non-empty directory counts as "already downloaded", so a
    # partially downloaded snapshot would presumably be skipped too — confirm
    # this is the intended behaviour.
    already_populated = os.path.isdir(local_dir) and bool(os.listdir(local_dir))
    if not already_populated:
        os.makedirs(local_dir, exist_ok=True)
        print(f"\nDownloading snapshot {repo_id} → {local_dir}...\n")
        snapshot_download(repo_id, local_dir=local_dir)
        print(f"Done: {local_dir}")
    else:
        print(f"Already exists: {local_dir}")


parser = argparse.ArgumentParser()
parser.add_argument("--resolution", choices=["720p", "480p", "both"], default="720p",
help="Which Wan2.1-I2V model to download (720p, 480p, or both)")
parser.add_argument("--skip_wan", action="store_true",
help="Skip the large Wan2.1-I2V model download")
parser.add_argument("--token", default=os.environ.get("HF_TOKEN"),
help="HuggingFace token (falls back to HF_TOKEN env var)")
args = parser.parse_args()

if args.token:
login(token=args.token)

os.makedirs("./checkpoints", exist_ok=True)
repo_id_list = ["Ruicheng/moge-vitl","Iceclear/StableSR","Iceclear/StableSR","Skywork/Matrix-3D","Skywork/Matrix-3D","Skywork/Matrix-3D","Skywork/Matrix-3D","Skywork/Matrix-3D"]
filename_list = ["model.pt","stablesr_turbo.ckpt","vqgan_cfw_00011.ckpt","checkpoints/text2panoimage_lora.safetensors","checkpoints/pano_lrm_480p.pt","checkpoints/pano_video_gen_480p.ckpt","checkpoints/pano_video_gen_720p.bin","checkpoints/pano_video_gen_720p_5b.safetensors"]
local_dir_list = ["./checkpoints/moge","./checkpoints/StableSR","./checkpoints/StableSR","./checkpoints/flux_lora","./checkpoints/pano_lrm","./checkpoints/Wan-AI/wan_lora","./checkpoints/Wan-AI/wan_lora","./checkpoints/Wan-AI/wan_lora"]

N = len(repo_id_list)
for i in range(N):
repo_id = repo_id_list[i]
filename = filename_list[i]
local_dir = local_dir_list[i]
print(f"\nDownloading {filename} from {repo_id} to local folder {local_dir}...\n")

# Small checkpoints from HF
repo_id_list = ["Ruicheng/moge-vitl", "Iceclear/StableSR", "Iceclear/StableSR",
"Skywork/Matrix-3D", "Skywork/Matrix-3D", "Skywork/Matrix-3D",
"Skywork/Matrix-3D", "Skywork/Matrix-3D"]
filename_list = ["model.pt", "stablesr_turbo.ckpt", "vqgan_cfw_00011.ckpt",
"checkpoints/text2panoimage_lora.safetensors", "checkpoints/pano_lrm_480p.pt",
"checkpoints/pano_video_gen_480p.ckpt", "checkpoints/pano_video_gen_720p.bin",
"checkpoints/pano_video_gen_720p_5b.safetensors"]
local_dir_list = ["./checkpoints/moge", "./checkpoints/StableSR", "./checkpoints/StableSR",
"./checkpoints/flux_lora", "./checkpoints/pano_lrm",
"./checkpoints/Wan-AI/wan_lora", "./checkpoints/Wan-AI/wan_lora",
"./checkpoints/Wan-AI/wan_lora"]

for repo_id, filename, local_dir in zip(repo_id_list, filename_list, local_dir_list):
print(f"\nDownloading {filename} from {repo_id} → {local_dir}...\n")
download_ckpt(local_dir, repo_id, filename)

# VideoLLaMA3-7B (prompt generation for i2p mode) — to HF cache
print("\nDownloading DAMO-NLP-SG/VideoLLaMA3-7B to HF cache...\n")
snapshot_download("DAMO-NLP-SG/VideoLLaMA3-7B")

# FLUX.1-Fill-dev (gated — requires HF token + accepted terms)
# Downloaded to HF cache (not local_dir) so from_pretrained() finds it automatically
print("\nDownloading black-forest-labs/FLUX.1-Fill-dev to HF cache...\n")
snapshot_download("black-forest-labs/FLUX.1-Fill-dev")

# Large Wan2.1-I2V base model
if not args.skip_wan:
if args.resolution in ("720p", "both"):
download_snapshot("./checkpoints/Wan-AI/Wan2.1-I2V-14B-720P", "Wan-AI/Wan2.1-I2V-14B-720P")
if args.resolution in ("480p", "both"):
download_snapshot("./checkpoints/Wan-AI/Wan2.1-I2V-14B-480P", "Wan-AI/Wan2.1-I2V-14B-480P")
6 changes: 5 additions & 1 deletion code/pano_init/src/worldgen/pano_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ def build_pano_gen_model(lora_path=None, device="cuda"):
def build_pano_fill_model(lora_path=None, device="cuda:0"):
if lora_path is None:
lora_path = hf_hub_download(repo_id="LeoXie/WorldGen", filename=f"models--WorldGen-Flux-Lora/worldgen_img2scene.safetensors")
pipe = FluxFillPipeline.from_pretrained("black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16, device=device)
pipe = FluxFillPipeline.from_pretrained(
"black-forest-labs/FLUX.1-Fill-dev",
dtype=torch.bfloat16,
low_cpu_mem_usage=True,
)
print(f"Loading LoRA weights from: {lora_path}")
pipe.load_lora_weights(lora_path)

Expand Down
2 changes: 1 addition & 1 deletion code/panoramic_image_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def create_output_dir(base_path: str, prefix: str = "example") -> str:
os.makedirs(base_path, exist_ok=True)
max_num = 0
for dirname in os.listdir(base_path):
match = re.match(f"{prefix}(\d+)", dirname)
match = re.match(f"{prefix}(\\d+)", dirname)
if match:
max_num = max(max_num, int(match.group(1)))
new_dir = f"{prefix}{max_num + 1}"
Expand Down
14 changes: 7 additions & 7 deletions code/panoramic_image_to_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ def main(args):

if not use_5b_model:
#vid_path, mask_path,text,
tgt_resolution = (1440,720) if is_720p else (960,480)
tgt_resolution = (960,720) if is_720p else (960,480)
#dset = TextVideoDataset(vid_path = os.path.join(condition_dir,"rendered_rgb.mp4"), mask_path = os.path.join(condition_dir,"rendered_mask.mp4"), text=prompt)
# (self, vid_path, mask_path,text, max_num_frames=81, frame_interval=1, num_frames=81, height=720, width=1440, is_i2v=True):
dset = TextVideoDataset(vid_path = os.path.join(condition_dir,"rendered_rgb.mp4"), mask_path = os.path.join(condition_dir,"rendered_mask.mp4"), text=prompt, height=tgt_resolution[1],width=tgt_resolution[0])
Expand All @@ -373,8 +373,8 @@ def main(args):
prompt=prompt+" The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
negative_prompt="The video is not of a high quality, it has a low resolution. Distortion. strange artifacts.",
cfg_scale=5.0,
num_frames=81,
num_inference_steps=50,
num_frames=161,
num_inference_steps=5,
seed=seed, tiled=True,
height=tgt_resolution[1],
width=tgt_resolution[0],
Expand All @@ -397,7 +397,7 @@ def main(args):
dset = VideoDataset(
#base_path="/", metadata_path="/datasets_3d/zhongqi.yang/matrix3d_inference/dataset/metadata_1k.csv",
base_path="/", metadata_path=None,
num_frames=81,
num_frames=161,
time_division_factor=4, time_division_remainder=1,
max_pixels=height*width, height=height, width=width,
height_division_factor=16, width_division_factor=16,
Expand All @@ -416,12 +416,12 @@ def main(args):
seed=120, tiled=True,
height=height, width=width,
input_image=cases["video"][0],
num_frames=81,
num_frames=161,
cond_video = (cases["cond_video"]),
cond_mask = (cases["cond_mask"]),
)
# the original resolution of 5b model is actually [704,1408], in order to be unified with latter steps, we resize the output to [720,1440].
video = [img.resize((1440,720)) for img in video_ori]
# the original resolution of the 5b model is actually [704,1408]; to stay consistent with later steps, we resize the output to [720,960] (height, width).
video = [img.resize((960,720)) for img in video_ori]

if dist.get_rank() == 0:
generated_dir = os.path.join(case_dir,"generated")
Expand Down
233 changes: 233 additions & 0 deletions code/vbench_batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
"""
VBench batch generation for Matrix-3D.
Runs the two-step pipeline (i2p panorama → video) for all scenery/indoor prompts.

Usage (from Matrix-3D root):
python code/vbench_batch.py [--output_dir output/vbench/videos] [--num_samples 5] [--seed 0] [--resolution 720]
"""
import argparse
import csv
import json
import os
import random
import glob as _glob
import re
import shutil
import subprocess
import sys
import time

import psutil
import torch

_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_ROOT_DIR = os.path.dirname(_SCRIPT_DIR)
_VBENCH_DATA = os.path.join(_ROOT_DIR, "..", "VBench", "vbench2_beta_i2v", "vbench2_beta_i2v", "data")
_DEFAULT_JSON = os.path.join(_VBENCH_DATA, "i2v-bench-info.json")
_DEFAULT_CROP = os.path.join(_VBENCH_DATA, "crop", "1-1")
_CATEGORIES = {"scenery", "indoor"}


def _safe(text):
    """Return *text* turned into a filename-friendly string.

    Every occurrence of ``<``, ``>``, ``:``, ``"``, ``/``, ``\\``, ``|``, ``?``
    or ``*`` is replaced with an underscore, and the result is cut to at most
    150 characters.
    """
    sanitized = re.sub(r'[<>:"/\\|?*]', "_", text)
    return sanitized[:150]


def _fmt_duration(secs):
    """Format a duration given in seconds as ``HHhMMmSSs`` (e.g. ``01h01m01s``).

    Fractions of a second are dropped; hours are not capped at two digits.
    """
    hours = int(secs // 3600)
    minutes = int((secs % 3600) // 60)
    seconds = int(secs % 60)
    return f"{hours:02d}h{minutes:02d}m{seconds:02d}s"


def _sys_stats():
    """Sample current RAM and GPU memory figures, expressed in GiB.

    Returns:
        A tuple ``(ram_used, ram_total, gpu_used, gpu_total)``. The RAM values
        come from ``psutil.virtual_memory()``. The GPU values are
        ``torch.cuda.memory_allocated()`` and the ``total_memory`` of device 0;
        both are ``0.0`` when CUDA is not available.
    """
    gib = 1024**3
    mem = psutil.virtual_memory()
    if not torch.cuda.is_available():
        return mem.used / gib, mem.total / gib, 0.0, 0.0
    # NOTE(review): memory_allocated() presumably only counts tensors owned by
    # this process, not by the generation subprocesses — confirm that this is
    # the figure the stats file is meant to record.
    allocated = torch.cuda.memory_allocated() / gib
    capacity = torch.cuda.get_device_properties(0).total_memory / gib
    return mem.used / gib, mem.total / gib, allocated, capacity


def run(cmd, cwd=None):
    """Run *cmd* as a child process and return its exit code.

    Args:
        cmd: Argument list handed to ``subprocess.run``.
        cwd: Working directory for the child; ``_ROOT_DIR`` is used when this
            is ``None`` or empty.
    """
    workdir = cwd if cwd else _ROOT_DIR
    completed = subprocess.run(cmd, cwd=workdir)
    return completed.returncode


def _load_prompts(info_json):
    """Return de-duplicated ``(file_name, caption)`` pairs for the entries in ``_CATEGORIES``."""
    with open(info_json, encoding="utf-8") as f:
        entries = json.load(f)

    seen, prompts = set(), []
    for e in entries:
        name = e["file_name"]
        if name in seen:
            continue
        if e.get("type") not in _CATEGORIES:
            continue
        seen.add(name)
        # Fall back to the file stem when an entry carries no caption.
        caption = e.get("caption", os.path.splitext(name)[0])
        prompts.append((name, caption))
    return prompts


def _sample_seed(base_seed, task_idx, sample_idx):
    """Derive a reproducible per-video seed from ``--seed`` and the task/sample position."""
    # Seeding from the position (not from a shared generator) keeps each video's
    # seed stable no matter which other videos are skipped or missing.
    return random.Random(f"{base_seed}:{task_idx}:{sample_idx}").randint(0, 2**31 - 1)


def _find_existing(out_dir, prefix):
    """Return ``(path, seed)`` of a finished ``<prefix><seed>.mp4`` in *out_dir*, or ``None``."""
    # glob.escape stops "[", "]" etc. in captions or paths from acting as wildcards.
    pattern = _glob.escape(os.path.join(out_dir, prefix)) + "*.mp4"
    for path in sorted(_glob.glob(pattern)):
        # Accept only a purely numeric seed after the prefix; this rejects videos of
        # other captions that merely start with the same text.
        seed_text = os.path.basename(path)[len(prefix):-len(".mp4")]
        if seed_text.isdecimal():
            return path, int(seed_text)
    return None


def _generate_video(image_path, work_dir, out_path, seed, resolution):
    """Run both pipeline steps for one sample and publish the video to *out_path*.

    Raises:
        RuntimeError: If a step exits with a non-zero code or the expected
            output video is missing.
    """
    # Step 1: image → panorama
    print(f"[vbench] step 1/2 panorama generation …")
    t1 = time.time()
    rc = run([
        sys.executable, "code/panoramic_image_generation.py",
        "--mode=i2p",
        f"--input_image_path={image_path}",
        f"--output_path={work_dir}",
        f"--seed={seed}",
    ])
    if rc != 0:
        raise RuntimeError(f"panoramic_image_generation.py exited with code {rc}")
    print(f"[vbench] step 1/2 done ({time.time()-t1:.0f}s)")

    # Step 2: panorama → video
    print(f"[vbench] step 2/2 video generation …")
    t2 = time.time()
    rc = run([
        sys.executable, "code/panoramic_image_to_video.py",
        f"--inout_dir={work_dir}",
        f"--resolution={resolution}",
        f"--seed={seed}",
    ])
    if rc != 0:
        raise RuntimeError(f"panoramic_image_to_video.py exited with code {rc}")
    print(f"[vbench] step 2/2 done ({time.time()-t2:.0f}s)")

    generated_mp4 = os.path.join(work_dir, "generated", "generated.mp4")
    if not os.path.exists(generated_mp4):
        raise RuntimeError(f"output video not found: {generated_mp4}")

    # Copy under a temporary name and rename afterwards, so an interrupted copy can
    # never leave a truncated .mp4 that a later run would treat as already done.
    tmp_path = out_path + ".part"
    shutil.copy2(generated_mp4, tmp_path)
    os.replace(tmp_path, out_path)
    shutil.rmtree(work_dir, ignore_errors=True)


def main():
    """Generate VBench videos for every selected prompt and log per-video statistics.

    For each image in the VBench info JSON whose type is in ``_CATEGORIES`` and for
    each of ``--num_samples`` samples, this runs ``panoramic_image_generation.py``
    followed by ``panoramic_image_to_video.py`` in a scratch directory, copies the
    resulting video into ``--output_dir`` and appends one row to
    ``vbench_gen_stats.csv`` in the parent of that directory. Samples whose video
    already exists are skipped, so an interrupted batch can simply be re-run.
    Failures of a single video are logged and recorded; the batch continues.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", default=os.path.join(_ROOT_DIR, "output", "vbench", "videos"))
    parser.add_argument("--num_samples", type=int, default=5)
    parser.add_argument("--seed", type=int, default=0,
                        help="Base seed; the seed of each video is derived from it")
    parser.add_argument("--resolution", type=int, default=720)
    parser.add_argument("--vbench_json", default=_DEFAULT_JSON)
    parser.add_argument("--crop_dir", default=_DEFAULT_CROP)
    args = parser.parse_args()

    info_json = os.path.abspath(args.vbench_json)
    crop_dir = os.path.abspath(args.crop_dir)
    out_dir = os.path.abspath(args.output_dir)
    os.makedirs(out_dir, exist_ok=True)

    stats_path = os.path.join(os.path.dirname(out_dir), "vbench_gen_stats.csv")
    # An existing but empty file (e.g. from an aborted run) still needs its header.
    stats_is_new = not os.path.exists(stats_path) or os.path.getsize(stats_path) == 0

    ram_total_gb = psutil.virtual_memory().total / 1024**3
    gpu_total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0.0

    prompts = _load_prompts(info_json)

    total = len(prompts) * args.num_samples
    print(f"{'='*70}")
    print(f"[vbench] Matrix-3D VBench batch")
    print(f"[vbench] {len(prompts)} prompts x {args.num_samples} samples = {total} videos")
    print(f"[vbench] categories: {sorted(_CATEGORIES)} resolution: {args.resolution}p")
    print(f"[vbench] output → {out_dir}")
    print(f"[vbench] stats → {stats_path}")
    print(f"{'='*70}")

    done = skipped = generated = errors = 0
    ok_total_s = 0.0
    t_start = time.time()

    # The context manager closes the stats file even if the batch is interrupted.
    with open(stats_path, "a", newline="", encoding="utf-8") as stats_f:
        stats_w = csv.writer(stats_f)
        if stats_is_new:
            stats_w.writerow(["timestamp", "task_idx", "prompt", "sample_idx", "seed", "duration_s",
                              "video_count", "total_elapsed_s", "avg_s_per_video",
                              "ram_used_gb", "ram_total_gb", "gpu_used_gb", "gpu_total_gb",
                              "out_path", "status"])

        for task_idx, (image_name, caption) in enumerate(prompts):
            image_path = os.path.join(crop_dir, image_name)
            if not os.path.isfile(image_path):
                print(f"[vbench] SKIP: image not found — {image_path}")
                # These samples will never run; count them so the progress
                # percentage and the ETA are not computed against phantom work.
                done += args.num_samples
                continue

            for sample_idx in range(args.num_samples):
                prefix = f"{_safe(caption)}-{sample_idx}-"
                existing = _find_existing(out_dir, prefix)
                if existing is not None:
                    # Report the seed the existing video was actually made with.
                    out_path, seed = existing
                else:
                    seed = _sample_seed(args.seed, task_idx, sample_idx)
                    out_path = os.path.join(out_dir, f"{prefix}{seed}.mp4")

                # ── header ──────────────────────────────────────────────────────
                elapsed = time.time() - t_start
                pct = 100 * done / total if total else 0
                eta_str = ""
                avg_str = ""
                if generated > 0:
                    avg_s = ok_total_s / generated
                    remaining = (total - done) * avg_s
                    eta_str = f" ETA {_fmt_duration(remaining)}"
                    avg_str = f" avg {avg_s/60:.1f} min/video"
                print(f"\n{'─'*70}")
                print(f"[vbench] [{done+1}/{total} {pct:.0f}%{eta_str}{avg_str}] elapsed {_fmt_duration(elapsed)}")
                print(f"[vbench] prompt {task_idx+1}/{len(prompts)} sample {sample_idx+1}/{args.num_samples} seed {seed}")
                print(f"[vbench] {caption[:70]}")

                if existing is not None:
                    print(f"[vbench] → SKIP (already exists)")
                    skipped += 1
                    done += 1
                    stats_w.writerow([time.strftime("%Y-%m-%dT%H:%M:%S"), task_idx, caption, sample_idx, seed,
                                      "", generated, f"{elapsed:.1f}", "",
                                      "", f"{ram_total_gb:.2f}", "", f"{gpu_total_gb:.2f}", out_path, "skipped"])
                    stats_f.flush()
                    continue

                work_dir = os.path.join(_ROOT_DIR, "output", "vbench", "_work", f"{task_idx}_{sample_idx}")
                os.makedirs(work_dir, exist_ok=True)

                try:
                    st = time.time()
                    _generate_video(image_path, work_dir, out_path, seed, args.resolution)

                    ed = time.time()
                    duration = ed - st
                    ok_total_s += duration
                    generated += 1

                    ram_used, _, gpu_used, _ = _sys_stats()
                    total_elapsed = ed - t_start
                    avg_s_per = ok_total_s / generated

                    print(f"[vbench] ✓ saved {os.path.basename(out_path)}")
                    print(f"[vbench] duration {_fmt_duration(duration)} | avg {avg_s_per/60:.1f} min/video")
                    print(f"[vbench] RAM {ram_used:.1f}/{ram_total_gb:.0f} GB | GPU {gpu_used:.1f}/{gpu_total_gb:.0f} GB")

                    stats_w.writerow([time.strftime("%Y-%m-%dT%H:%M:%S"), task_idx, caption, sample_idx, seed,
                                      f"{duration:.1f}", generated, f"{total_elapsed:.1f}", f"{avg_s_per:.1f}",
                                      f"{ram_used:.2f}", f"{ram_total_gb:.2f}", f"{gpu_used:.2f}", f"{gpu_total_gb:.2f}",
                                      out_path, "ok"])
                    stats_f.flush()

                except Exception as exc:
                    # Per-video boundary: record the failure and carry on with the batch.
                    print(f"[vbench] ✗ ERROR: {exc}")
                    ram_used, _, gpu_used, _ = _sys_stats()
                    stats_w.writerow([time.strftime("%Y-%m-%dT%H:%M:%S"), task_idx, caption, sample_idx, seed,
                                      "", generated, f"{time.time()-t_start:.1f}", "",
                                      f"{ram_used:.2f}", f"{ram_total_gb:.2f}", f"{gpu_used:.2f}", f"{gpu_total_gb:.2f}",
                                      out_path, "error"])
                    stats_f.flush()
                    errors += 1

                done += 1

    elapsed_total = time.time() - t_start
    print(f"\n{'='*70}")
    print(f"[vbench] DONE generated={generated} skipped={skipped} errors={errors}")
    print(f"[vbench] total elapsed: {_fmt_duration(elapsed_total)}")
    if generated:
        print(f"[vbench] avg per video: {ok_total_s/generated/60:.1f} min")
    print(f"[vbench] videos → {out_dir}")
    print(f"[vbench] stats → {stats_path}")
    print(f"{'='*70}")


if __name__ == "__main__":
main()
Loading