SkyworkAI · 3a1b2c3 · Mar 7, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/code/download_checkpoints.py b/code/download_checkpoints.py
@@ -1,7 +1,10 @@
+import argparse
 import os
 # os.environ["HF_ENDPOINT"] = 'https://hf-mirror.com'
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import hf_hub_download, snapshot_download, login
 
+# Set HF_TOKEN environment variable before running this script, e.g.:
+# export HF_TOKEN=hf_...
 
 def download_ckpt(local_dir, repo_id, filename):
     os.makedirs(local_dir, exist_ok=True)
@@ -15,15 +18,61 @@ def download_ckpt(local_dir, repo_id, filename):
         print(f"File has been downloaded to: {file_path}")
     else:
         print(f"File exists already: {local_path}")
+
+
+def download_snapshot(local_dir, repo_id):
+    """Download every file of `repo_id` into `local_dir`, resuming partial copies."""
+    # A non-empty directory may be an interrupted download, so always call
+    # snapshot_download(); it skips files that are already complete.
+    os.makedirs(local_dir, exist_ok=True)
+    print(f"\nDownloading snapshot {repo_id} → {local_dir}...\n")
+    snapshot_download(repo_id, local_dir=local_dir)
+    print(f"Done: {local_dir}")
+
+
+parser = argparse.ArgumentParser()  # command-line options of this download script
+parser.add_argument("--resolution", choices=["720p", "480p", "both"], default="720p",
+                    help="Which Wan2.1-I2V model to download (720p, 480p, or both)")
+parser.add_argument("--skip_wan", action="store_true",
+                    help="Skip the large Wan2.1-I2V model download")
+parser.add_argument("--token", default=os.environ.get("HF_TOKEN"),
+                    help="HuggingFace token (falls back to HF_TOKEN env var)")
+args = parser.parse_args()  # runs at module level: this file is a script, not a library
+
+if args.token:  # neither --token nor HF_TOKEN set: skip the explicit login
+    login(token=args.token)  # NOTE(review): presumably needed for the gated FLUX.1-Fill-dev repo — confirm
+
 os.makedirs("./checkpoints", exist_ok=True)
-repo_id_list = ["Ruicheng/moge-vitl","Iceclear/StableSR","Iceclear/StableSR","Skywork/Matrix-3D","Skywork/Matrix-3D","Skywork/Matrix-3D","Skywork/Matrix-3D","Skywork/Matrix-3D"]
-filename_list = ["model.pt","stablesr_turbo.ckpt","vqgan_cfw_00011.ckpt","checkpoints/text2panoimage_lora.safetensors","checkpoints/pano_lrm_480p.pt","checkpoints/pano_video_gen_480p.ckpt","checkpoints/pano_video_gen_720p.bin","checkpoints/pano_video_gen_720p_5b.safetensors"]
-local_dir_list = ["./checkpoints/moge","./checkpoints/StableSR","./checkpoints/StableSR","./checkpoints/flux_lora","./checkpoints/pano_lrm","./checkpoints/Wan-AI/wan_lora","./checkpoints/Wan-AI/wan_lora","./checkpoints/Wan-AI/wan_lora"]
-
-N = len(repo_id_list)
-for i in range(N):
-    repo_id = repo_id_list[i]
-    filename = filename_list[i]
-    local_dir = local_dir_list[i]
-    print(f"\nDownloading {filename} from {repo_id} to local folder {local_dir}...\n")
+
+# Small checkpoints from HF: three parallel lists, entry i of each describes one file
+repo_id_list   = ["Ruicheng/moge-vitl", "Iceclear/StableSR", "Iceclear/StableSR",
+                  "Skywork/Matrix-3D", "Skywork/Matrix-3D", "Skywork/Matrix-3D",
+                  "Skywork/Matrix-3D", "Skywork/Matrix-3D"]
+filename_list  = ["model.pt", "stablesr_turbo.ckpt", "vqgan_cfw_00011.ckpt",
+                  "checkpoints/text2panoimage_lora.safetensors", "checkpoints/pano_lrm_480p.pt",
+                  "checkpoints/pano_video_gen_480p.ckpt", "checkpoints/pano_video_gen_720p.bin",
+                  "checkpoints/pano_video_gen_720p_5b.safetensors"]
+local_dir_list = ["./checkpoints/moge", "./checkpoints/StableSR", "./checkpoints/StableSR",
+                  "./checkpoints/flux_lora", "./checkpoints/pano_lrm",
+                  "./checkpoints/Wan-AI/wan_lora", "./checkpoints/Wan-AI/wan_lora",
+                  "./checkpoints/Wan-AI/wan_lora"]
+
+for repo_id, filename, local_dir in zip(repo_id_list, filename_list, local_dir_list):  # one file per iteration
+    print(f"\nDownloading {filename} from {repo_id} → {local_dir}...\n")
     download_ckpt(local_dir, repo_id, filename)
+
+# Both models below are fetched into the default HF cache, not ./checkpoints.
+# VideoLLaMA3-7B: prompt generation for i2p mode.
+# FLUX.1-Fill-dev: gated (requires HF token + accepted terms); it stays in the
+# HF cache so that from_pretrained() finds it automatically.
+for cached_repo in ("DAMO-NLP-SG/VideoLLaMA3-7B", "black-forest-labs/FLUX.1-Fill-dev"):
+    print(f"\nDownloading {cached_repo} to HF cache...\n")
+    snapshot_download(cached_repo)
+
+# Large Wan2.1-I2V base model: 720P and/or 480P, unless --skip_wan was given
+if not args.skip_wan:
+    for res in ("720p", "480p"):
+        if args.resolution in (res, "both"):
+            wan_repo = f"Wan-AI/Wan2.1-I2V-14B-{res.upper()}"
+            wan_dir = f"./checkpoints/{wan_repo}"
+            download_snapshot(wan_dir, wan_repo)
diff --git a/code/pano_init/src/worldgen/pano_gen.py b/code/pano_init/src/worldgen/pano_gen.py
@@ -20,7 +20,11 @@ def build_pano_gen_model(lora_path=None, device="cuda"):
 def build_pano_fill_model(lora_path=None, device="cuda:0"):
     if lora_path is None:
         lora_path = hf_hub_download(repo_id="LeoXie/WorldGen", filename=f"models--WorldGen-Flux-Lora/worldgen_img2scene.safetensors")
-    pipe = FluxFillPipeline.from_pretrained("black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16, device=device)
+    pipe = FluxFillPipeline.from_pretrained(
+        "black-forest-labs/FLUX.1-Fill-dev",
+        dtype=torch.bfloat16,
+        low_cpu_mem_usage=True,
+    )
     print(f"Loading LoRA weights from: {lora_path}")
     pipe.load_lora_weights(lora_path)
 

diff --git a/code/panoramic_image_generation.py b/code/panoramic_image_generation.py
@@ -13,7 +13,7 @@ def create_output_dir(base_path: str, prefix: str = "example") -> str:
     os.makedirs(base_path, exist_ok=True)
     max_num = 0
     for dirname in os.listdir(base_path):
-        match = re.match(f"{prefix}(\d+)", dirname)
+        match = re.match(f"{prefix}(\\d+)", dirname)
         if match:
             max_num = max(max_num, int(match.group(1)))
     new_dir = f"{prefix}{max_num + 1}"

diff --git a/code/panoramic_image_to_video.py b/code/panoramic_image_to_video.py
@@ -360,7 +360,7 @@ def main(args):
 
     if not use_5b_model:
         #vid_path, mask_path,text,
-        tgt_resolution = (1440,720) if is_720p else (960,480)
+        tgt_resolution = (960,720) if is_720p else (960,480)
         #dset = TextVideoDataset(vid_path = os.path.join(condition_dir,"rendered_rgb.mp4"), mask_path = os.path.join(condition_dir,"rendered_mask.mp4"), text=prompt)
         # (self, vid_path, mask_path,text, max_num_frames=81, frame_interval=1, num_frames=81, height=720, width=1440, is_i2v=True):
         dset = TextVideoDataset(vid_path = os.path.join(condition_dir,"rendered_rgb.mp4"), mask_path = os.path.join(condition_dir,"rendered_mask.mp4"), text=prompt, height=tgt_resolution[1],width=tgt_resolution[0])
@@ -373,8 +373,8 @@ def main(args):
             prompt=prompt+" The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
             negative_prompt="The video is not of a high quality, it has a low resolution. Distortion. strange artifacts.",
             cfg_scale=5.0,
-            num_frames=81,
-            num_inference_steps=50,
+            num_frames=161,
+            num_inference_steps=5,
             seed=seed, tiled=True,
             height=tgt_resolution[1],
             width=tgt_resolution[0],
@@ -397,7 +397,7 @@ def main(args):
         dset = VideoDataset(
             #base_path="/", metadata_path="/datasets_3d/zhongqi.yang/matrix3d_inference/dataset/metadata_1k.csv",
             base_path="/", metadata_path=None,
-            num_frames=81,
+            num_frames=161,
             time_division_factor=4, time_division_remainder=1,
             max_pixels=height*width, height=height, width=width,
             height_division_factor=16, width_division_factor=16,
@@ -416,12 +416,12 @@ def main(args):
             seed=120, tiled=True,
             height=height, width=width,
             input_image=cases["video"][0],
-            num_frames=81,
+            num_frames=161,
             cond_video = (cases["cond_video"]),
             cond_mask = (cases["cond_mask"]),
         )
-        # the original resolution of 5b model is actually [704,1408], in order to be unified with latter steps, we resize the output to [720,1440].
-        video = [img.resize((1440,720)) for img in video_ori]
+        # The native resolution of the 5b model is [704,1408] (H,W); to stay unified with later steps, the frames are resized to [720,960] (H,W), i.e. PIL size (960,720).
+        video = [img.resize((960,720)) for img in video_ori]
 
     if dist.get_rank() == 0:
         generated_dir = os.path.join(case_dir,"generated")

diff --git a/code/vbench_batch.py b/code/vbench_batch.py
@@ -0,0 +1,233 @@
+"""
+VBench batch generation for Matrix-3D.
+Runs the two-step pipeline (i2p panorama → video) for all scenery/indoor prompts.
+
+Usage (from Matrix-3D root):
+    python code/vbench_batch.py [--output_dir output/vbench/videos] [--num_samples 5] [--seed 0] [--resolution 720]
+"""
+import argparse
+import csv
+import json
+import os
+import random
+import glob as _glob
+import re
+import shutil
+import subprocess
+import sys
+import time
+
+import psutil
+import torch
+
+_SCRIPT_DIR   = os.path.dirname(os.path.abspath(__file__))  # directory that holds this script
+_ROOT_DIR     = os.path.dirname(_SCRIPT_DIR)  # its parent; default cwd for the pipeline subprocesses
+_VBENCH_DATA  = os.path.join(_ROOT_DIR, "..", "VBench", "vbench2_beta_i2v", "vbench2_beta_i2v", "data")  # expects a VBench checkout next to _ROOT_DIR
+_DEFAULT_JSON = os.path.join(_VBENCH_DATA, "i2v-bench-info.json")  # default for --vbench_json
+_DEFAULT_CROP = os.path.join(_VBENCH_DATA, "crop", "1-1")  # default for --crop_dir
+_CATEGORIES   = {"scenery", "indoor"}  # entry "type" values that are kept; all others are ignored
+
+
+def _safe(text):  # file-name-safe form of `text`: reserved characters become "_", capped at 150 chars
+    return "".join("_" if ch in '<>:"/\\|?*' else ch for ch in text)[:150]
+
+
+def _fmt_duration(secs):  # seconds -> "HHhMMmSSs"; the fractional part is dropped
+    h, rem = divmod(secs, 3600)
+    return f"{int(h):02d}h{int(rem // 60):02d}m{int(secs % 60):02d}s"
+
+
+def _sys_stats():  # -> (ram_used, ram_total, gpu_used, gpu_total), each divided by 1024**3
+    gib = 1024**3
+    mem = psutil.virtual_memory()
+    if not torch.cuda.is_available():
+        # no CUDA device: both GPU figures are reported as 0.0
+        return mem.used / gib, mem.total / gib, 0.0, 0.0
+    # NOTE(review): memory_allocated() is presumably per-process, so subprocess work is not counted — confirm
+    gpu_used = torch.cuda.memory_allocated() / gib
+    gpu_total = torch.cuda.get_device_properties(0).total_memory / gib
+    return mem.used / gib, mem.total / gib, gpu_used, gpu_total
+
+
+def run(cmd, cwd=None):  # run `cmd` (cwd defaults to _ROOT_DIR) and return its exit code
+    workdir = cwd if cwd else _ROOT_DIR
+    return subprocess.run(cmd, cwd=workdir).returncode
+
+
+def _load_prompts(info_json):
+    """Return the unique (file_name, caption) pairs of the selected categories."""
+    with open(info_json, encoding="utf-8") as f:
+        entries = json.load(f)
+
+    seen, prompts = set(), []
+    for e in entries:
+        name = e["file_name"]
+        if name in seen or e.get("type") not in _CATEGORIES:
+            continue
+        seen.add(name)
+        # fall back to the file stem when an entry carries no caption
+        prompts.append((name, e.get("caption", os.path.splitext(name)[0])))
+    return prompts
+
+
+def _generate_video(image_path, work_dir, seed, resolution, out_path):
+    """Run both pipeline steps for one sample and copy the result to `out_path`.
+
+    Raises RuntimeError when a step exits non-zero or produces no video.
+    """
+    steps = [
+        ("panorama generation", "panoramic_image_generation.py",
+         ["--mode=i2p", f"--input_image_path={image_path}", f"--output_path={work_dir}"]),
+        ("video generation", "panoramic_image_to_video.py",
+         [f"--inout_dir={work_dir}", f"--resolution={resolution}"]),
+    ]
+    for step_no, (label, script, script_args) in enumerate(steps, start=1):
+        print(f"[vbench] step {step_no}/2  {label} …")
+        t_step = time.time()
+        rc = run([sys.executable, f"code/{script}", *script_args, f"--seed={seed}"])
+        if rc != 0:
+            raise RuntimeError(f"{script} exited with code {rc}")
+        print(f"[vbench] step {step_no}/2  done  ({time.time()-t_step:.0f}s)")
+
+    generated_mp4 = os.path.join(work_dir, "generated", "generated.mp4")
+    if not os.path.exists(generated_mp4):
+        raise RuntimeError(f"output video not found: {generated_mp4}")
+    shutil.copy2(generated_mp4, out_path)
+    shutil.rmtree(work_dir, ignore_errors=True)
+
+
+def main():
+    """Generate `--num_samples` videos for every scenery/indoor VBench prompt.
+
+    Each sample runs the image → panorama → video pipeline in subprocesses and
+    appends one row (ok / skipped / error) to `vbench_gen_stats.csv` in the parent
+    of the output directory. Samples whose video already exists are skipped, so a
+    batch can be resumed. Per-sample seeds come from an RNG seeded with `--seed`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output_dir",  default=os.path.join(_ROOT_DIR, "output", "vbench", "videos"))
+    parser.add_argument("--num_samples", type=int, default=5)
+    parser.add_argument("--seed",        type=int, default=0)
+    parser.add_argument("--resolution",  type=int, default=720)
+    parser.add_argument("--vbench_json", default=_DEFAULT_JSON)
+    parser.add_argument("--crop_dir",    default=_DEFAULT_CROP)
+    args = parser.parse_args()
+
+    info_json = os.path.abspath(args.vbench_json)
+    crop_dir  = os.path.abspath(args.crop_dir)
+    out_dir   = os.path.abspath(args.output_dir)
+    os.makedirs(out_dir, exist_ok=True)
+    stats_path   = os.path.join(os.path.dirname(out_dir), "vbench_gen_stats.csv")
+    stats_is_new = not os.path.exists(stats_path)
+    # Dedicated RNG so that --seed makes the per-sample seeds reproducible.
+    rng = random.Random(args.seed)
+
+    ram_total_gb = psutil.virtual_memory().total / 1024**3
+    gpu_total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0.0
+
+    prompts = _load_prompts(info_json)
+    total = len(prompts) * args.num_samples
+    print(f"{'='*70}")
+    print("[vbench] Matrix-3D VBench batch")
+    print(f"[vbench] {len(prompts)} prompts x {args.num_samples} samples = {total} videos")
+    print(f"[vbench] categories: {sorted(_CATEGORIES)}  resolution: {args.resolution}p")
+    print(f"[vbench] output → {out_dir}")
+    print(f"[vbench] stats  → {stats_path}")
+    print(f"{'='*70}")
+
+    done = skipped = generated = errors = 0
+    ok_total_s = 0.0
+    t_start = time.time()
+
+    # `with` closes the stats file even when the batch is interrupted.
+    with open(stats_path, "a", newline="", encoding="utf-8") as stats_f:
+        stats_w = csv.writer(stats_f)
+        if stats_is_new:
+            stats_w.writerow(["timestamp", "task_idx", "prompt", "sample_idx", "seed", "duration_s",
+                              "video_count", "total_elapsed_s", "avg_s_per_video",
+                              "ram_used_gb", "ram_total_gb", "gpu_used_gb", "gpu_total_gb",
+                              "out_path", "status"])
+
+        for task_idx, (image_name, caption) in enumerate(prompts):
+            # Draw the seeds first so they do not depend on which images exist.
+            seeds = [rng.randint(0, 2**31 - 1) for _ in range(args.num_samples)]
+            image_path = os.path.join(crop_dir, image_name)
+            if not os.path.isfile(image_path):
+                print(f"[vbench] SKIP: image not found — {image_path}")
+                done += args.num_samples  # keeps the progress counter and ETA consistent
+                continue
+
+            for sample_idx, seed in enumerate(seeds):
+                stem     = f"{_safe(caption)}-{sample_idx}"
+                out_path = os.path.join(out_dir, f"{stem}-{seed}.mp4")
+
+                # ── progress header ─────────────────────────────────────────
+                elapsed = time.time() - t_start
+                pct     = 100 * done / total if total else 0
+                eta_str = avg_str = ""
+                if generated > 0:
+                    avg_s     = ok_total_s / generated
+                    remaining = (total - done) * avg_s
+                    eta_str   = f"  ETA {_fmt_duration(remaining)}"
+                    avg_str   = f"  avg {avg_s/60:.1f} min/video"
+                print(f"\n{'─'*70}")
+                print(f"[vbench] [{done+1}/{total}  {pct:.0f}%{eta_str}{avg_str}]  elapsed {_fmt_duration(elapsed)}")
+                print(f"[vbench] prompt {task_idx+1}/{len(prompts)}  sample {sample_idx+1}/{args.num_samples}  seed {seed}")
+                print(f"[vbench] {caption[:70]}")
+
+                # Escape the literal part: captions and paths may contain glob metacharacters ("[").
+                existing = _glob.glob(_glob.escape(os.path.join(out_dir, stem)) + "-*.mp4")
+                if existing:
+                    out_path = existing[0]
+                    print("[vbench] → SKIP (already exists)")
+                    skipped += 1
+                    done += 1
+                    stats_w.writerow([time.strftime("%Y-%m-%dT%H:%M:%S"), task_idx, caption, sample_idx, seed,
+                                      "", generated, f"{elapsed:.1f}", "",
+                                      "", f"{ram_total_gb:.2f}", "", f"{gpu_total_gb:.2f}", out_path, "skipped"])
+                    stats_f.flush()
+                    continue
+
+                work_dir = os.path.join(_ROOT_DIR, "output", "vbench", "_work", f"{task_idx}_{sample_idx}")
+                os.makedirs(work_dir, exist_ok=True)
+
+                st = time.time()
+                try:
+                    _generate_video(image_path, work_dir, seed, args.resolution, out_path)
+                except Exception as exc:  # one failed sample must not abort the whole batch
+                    print(f"[vbench] ✗ ERROR: {exc}")
+                    ram_used, _, gpu_used, _ = _sys_stats()
+                    stats_w.writerow([time.strftime("%Y-%m-%dT%H:%M:%S"), task_idx, caption, sample_idx, seed,
+                                      "", generated, f"{time.time()-t_start:.1f}", "",
+                                      f"{ram_used:.2f}", f"{ram_total_gb:.2f}", f"{gpu_used:.2f}", f"{gpu_total_gb:.2f}",
+                                      out_path, "error"])
+                    errors += 1
+                else:
+                    ed = time.time()
+                    duration = ed - st
+                    ok_total_s += duration
+                    generated += 1
+                    ram_used, _, gpu_used, _ = _sys_stats()
+                    avg_s_per = ok_total_s / generated
+                    print(f"[vbench] ✓ saved  {os.path.basename(out_path)}")
+                    print(f"[vbench]   duration {_fmt_duration(duration)}  |  avg {avg_s_per/60:.1f} min/video")
+                    print(f"[vbench]   RAM {ram_used:.1f}/{ram_total_gb:.0f} GB  |  GPU {gpu_used:.1f}/{gpu_total_gb:.0f} GB")
+                    stats_w.writerow([time.strftime("%Y-%m-%dT%H:%M:%S"), task_idx, caption, sample_idx, seed,
+                                      f"{duration:.1f}", generated, f"{ed - t_start:.1f}", f"{avg_s_per:.1f}",
+                                      f"{ram_used:.2f}", f"{ram_total_gb:.2f}", f"{gpu_used:.2f}", f"{gpu_total_gb:.2f}",
+                                      out_path, "ok"])
+                stats_f.flush()
+                done += 1
+
+    print(f"\n{'='*70}")
+    print(f"[vbench] DONE  generated={generated}  skipped={skipped}  errors={errors}")
+    print(f"[vbench] total elapsed: {_fmt_duration(time.time() - t_start)}")
+    if generated:
+        print(f"[vbench] avg per video: {ok_total_s/generated/60:.1f} min")
+    print(f"[vbench] videos → {out_dir}")
+    print(f"[vbench] stats  → {stats_path}")
+    print(f"{'='*70}")
+
+
+if __name__ == "__main__":
+    main()