Conversation
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread. Please note that issues that do not follow the contributing guidelines are likely to be ignored.

@DN6 @sayakpaul what needs to be done prior to merging this PR?
DN6
left a comment
Looking good 👍🏽 Some minor comments. Could we also add a test for saving/loading and for setting `device_map` with a dummy model?
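A rough sketch of what such a test could look like (the dummy-model helper and flag names are placeholders, not from this PR):

```python
import tempfile

import torch


def test_flashpack_save_load_with_device_map(self):
    # Hypothetical sketch: `self.get_dummy_model()` stands in for whatever tiny
    # model the test mixin already builds; `use_flashpack` follows this PR and
    # assumes FlashPack weights are written by `save_pretrained`.
    model = self.get_dummy_model()
    with tempfile.TemporaryDirectory() as tmpdir:
        model.save_pretrained(tmpdir)
        loaded = model.__class__.from_pretrained(
            tmpdir, use_flashpack=True, device_map="cuda:0"
        )
    # The loaded model should land on the requested device with identical weights.
    assert loaded.device == torch.device("cuda:0")
    for p_orig, p_loaded in zip(model.parameters(), loaded.parameters()):
        assert torch.allclose(p_orig, p_loaded.cpu())
```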
```python
load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
trust_remote_code = kwargs.pop("trust_remote_code", False)
dduf_file: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_file", None)
use_flashpack = kwargs.pop("use_flashpack", True)
```
Suggested change:

```diff
- use_flashpack = kwargs.pop("use_flashpack", True)
+ use_flashpack = kwargs.pop("use_flashpack", False)
```
```python
    return parsed_parameters


def load_flashpack_checkpoint(flashpack_checkpoint_path: str):
```
I don't think this is used anywhere? We can remove it if that's the case.
```python
        subfolder=subfolder or "",
        dduf_entries=dduf_entries,
    )
elif use_flashpack:
```
We can consolidate this section into something like this:
```python
if use_flashpack:
    weights_name = _add_variant(FLASHPACK_WEIGHTS_NAME, variant)
elif use_safetensors:
    weights_name = _add_variant(SAFETENSORS_WEIGHTS_NAME, variant)
else:
    weights_name = None

if weights_name is not None:
    try:
        resolved_model_file = _get_model_file(
            pretrained_model_name_or_path,
            weights_name=weights_name,
            **model_file_kwargs,
        )
    except IOError as e:
        logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
        if not allow_pickle:
            raise
        logger.warning(
            "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
        )
```

```python
    )
    flashpack_device = None
else:
    flashpack_device = device_map[""]
```
Suggested change:

```diff
- flashpack_device = device_map[""]
+ device = device_map[""]
+ flashpack_device = torch.device(device) if not isinstance(device, torch.device) else device
```
```python
flashpack_device = device_map[""]
if flashpack_device in ["auto", "balanced", "balanced_low_0", "sequential"]:
    raise ValueError(
        "FlashPack `device_map` should be a device, not one of `auto`, `balanced`, `balanced_low_0`, `sequential`."
```
| "FlashPack `device_map` should be a device, not one of `auto`, `balanced`, `balanced_low_0`, `sequential`." | |
| "FlashPack `device_map` should not be one of `auto`, `balanced`, `balanced_low_0`, `sequential`. Use a specific device instead, e.g., `device_map='cuda'` or `device_map='cuda:0'" |
```python
    device=flashpack_device,
    **flashpack_kwargs,
)
```
Also need to reset `dtype_orig` since this path exits early.
Suggested change:

```diff
+ if dtype_orig is not None:
+     torch.set_default_dtype(dtype_orig)
```
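For context, a small illustration of the pattern the suggestion restores: `torch.set_default_dtype` is process-global, so every exit path has to put the previous default back.

```python
import torch

# Save the current default, switch to the temporary dtype for instantiation,
# and restore the original default no matter how loading exits.
dtype_orig = torch.get_default_dtype()
torch.set_default_dtype(torch.bfloat16)
try:
    ...  # instantiate/load the model while the temporary default dtype is active
finally:
    torch.set_default_dtype(dtype_orig)
```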
What does this PR do?
Adds support for FlashPack
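A minimal sketch of the intended opt-in usage (the repo id is a placeholder; the `use_flashpack` kwarg and the need for a concrete `device_map` follow the diff and review comments above):

```python
import torch
from diffusers import DiffusionPipeline

# Placeholder repo id; assumes the checkpoint ships FlashPack weights.
pipe = DiffusionPipeline.from_pretrained(
    "some-org/some-model",
    torch_dtype=torch.bfloat16,
    use_flashpack=True,   # kwarg added by this PR; its default is still under discussion
    device_map="cuda",    # FlashPack needs a concrete device, not "auto"/"balanced"
)
```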
- FlashPack could be used as a weight format only (see: `load_flashpack_checkpoint`) - keeping only the weight-format code would be a cleaner integration. Model loading is indeed faster [1], however part of the performance difference seems to be just due to the complexity of the existing `from_pretrained` code; for example, I noticed that `_caching_allocator_warmup` slows things down, and `empty_device_cache` is called in `_load_pretrained_model` whereas FlashPack's code doesn't call it, so the empty-cache time is excluded from FlashPack's benchmark results.
- … (`is_flashpack_available` check, logging, etc.)
- … (`silent`, `num_streams`, `use_distributed_loading`, etc.)

Benchmark
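A minimal sketch of how such a load-time comparison can be timed (placeholder repo id; not necessarily the exact script behind the reported numbers):

```python
import time

import torch
from diffusers import UNet2DConditionModel


def time_load(**kwargs):
    torch.cuda.synchronize()
    start = time.perf_counter()
    # Placeholder repo id for whichever checkpoint is being benchmarked.
    UNet2DConditionModel.from_pretrained(
        "some-org/some-model", subfolder="unet", torch_dtype=torch.bfloat16, **kwargs
    )
    torch.cuda.synchronize()
    return time.perf_counter() - start


print("safetensors:", time_load(device_map="cuda"))
print("flashpack:  ", time_load(device_map="cuda", use_flashpack=True))
```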
Changes from FlashPack's version:

- … `transformers` related code
- …

[1] For `bfloat16`; with `float16` the existing code appears to be faster.

Who can review?
Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.