From 52cf06bf1e63062c1790880692cb4a7ba22ba60b Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 6 Apr 2026 15:07:23 -0700 Subject: [PATCH 01/10] adding moon rock settings to config.yml --- config.yml | 26 ++++++++++++++++++++++++++ orchestration/flows/bl832/config.py | 1 + 2 files changed, 27 insertions(+) diff --git a/config.yml b/config.yml index ee5a72c6..69a2c077 100644 --- a/config.yml +++ b/config.yml @@ -175,6 +175,7 @@ mlflow: registry_uri: https://mlflow-staging.computing.als.lbl.gov hpc_submission_settings832: + # ── RECON + MULTIRES SETTINGS ─────────────────────────────────────────────── nersc_reconstruction: # ── SLURM resource allocation ───────────────────────────────────────────── qos: realtime @@ -190,6 +191,8 @@ hpc_submission_settings832: reservation: "" cpus-per-task: 128 walltime: "0:15:00" + + # ── PETIOLE SEGMENTATION SETTINGS ─────────────────────────────────────────── nersc_segmentation_sam3: # ── SLURM resource allocation ───────────────────────────────────────────── qos: regular @@ -258,3 +261,26 @@ hpc_submission_settings832: cfs_path: /global/cfs/cdirs/als/data_mover/8.3.2 conda_env_path: /global/cfs/cdirs/als/data_mover/8.3.2/envs/dino_demo seg_scripts_dir: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/inference_latest/forge_feb_seg_model_demo + + # ── MOON SEGMENTATION SETTINGS ─────────────────────────────────────────── + nersc_segmentation_dinov3_moon: + # ── SLURM resource allocation ───────────────────────────────────────────── + qos: regular + account: als + constraint: gpu + reservation: "" + num_nodes: 4 + ntasks-per-node: 1 + nproc_per_node: 4 + gpus-per-node: 4 + cpus-per-task: 128 + walltime: "00:59:00" + # ── Inference parameters ────────────────────────────────────────────────── + script_name: "src.inference_dino_v2" + project: "moon" + batch_size: 4 + # ── Paths ───────────────────────────────────────────────────────────────── + cfs_path: /global/cfs/cdirs/als/data_mover/8.3.2 + conda_env_path: /global/cfs/cdirs/als/data_mover/8.3.2/envs/dino_demo + seg_scripts_dir: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/inference_v5_multiseg/forge_feb_seg_model_demo/ + dino_checkpoint_path: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/dino_moon/best.ckpt diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index 441509f1..8bbbf78c 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -38,3 +38,4 @@ def _beam_specific_config(self) -> None: self.nersc_segment_sam3_settings = self.config["hpc_submission_settings832"]["nersc_segmentation_sam3"] self.nersc_segment_dinov3_settings = self.config["hpc_submission_settings832"]["nersc_segmentation_dinov3"] self.nersc_combine_segmentation_settings = self.config["hpc_submission_settings832"]["nersc_combine_segmentations"] + self.nersc_segment_dinov3_moon_settings = self.config["hpc_submission_settings832"]["nersc_segmentation_dinov3_moon"] From 07b9a53f5aa6d48c45cfdd97f3cc4db8e55d1bd1 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 6 Apr 2026 15:08:15 -0700 Subject: [PATCH 02/10] generalizing dinov3 segmentation to multiple projects, defining a class helper function _get_segmentation_spec(). created a new flow for moon rock segmentation --- orchestration/flows/bl832/nersc.py | 292 +++++++++++++++++++++++++++-- 1 file changed, 281 insertions(+), 11 deletions(-) diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 4e6299d5..cbe2482a 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass, field import datetime from dotenv import load_dotenv import json @@ -29,6 +30,31 @@ load_dotenv() +@dataclass +class SegmentationModelSpec: + """All config-resolution inputs for a single model+project combination. + + Consumed by ``_load_job_options`` and the job-script builders. + Adding a new model or project means adding one entry to the registry — + nothing else changes. + + :param variable_name: Prefect Variable name for runtime overrides. + :param settings: Config settings dict (from Config832) for base defaults. + :param mlflow_model_name: Registered MLflow model name. + :param mlflow_checkpoint_key: Config key populated from the MLflow + model's ``nersc_path`` tag. + :param output_subdir: Subdirectory written under ``seg_folder/``, + e.g. ``'dino'``, ``'sam3'``, ``'dino_moon'``. + :param extra_cli_flags: Additional flags injected into the inference + command, e.g. ``{'--project': 'moon'}``. Omit flags not needed. + """ + variable_name: str + settings: dict[str, Any] + mlflow_model_name: str + mlflow_checkpoint_key: str + extra_cli_flags: dict[str, str] = field(default_factory=dict) + + def _load_job_options( variable_name: str, config_settings: dict[str, Any], @@ -163,6 +189,44 @@ def create_sfapi_client() -> Client: logger.error(f"Failed to create NERSC client: {e}") raise e + def _get_segmentation_spec(self, model: str, project: str) -> SegmentationModelSpec: + """Return the SegmentationModelSpec for a model+project combination. + + :param model: Model family, e.g. ``'dinov3'`` or ``'sam3'``. + :param project: Experiment project, e.g. ``'petiole'`` or ``'moon'``. + :return: The corresponding SegmentationModelSpec. + :raises ValueError: If the combination is not registered. + """ + registry: dict[tuple[str, str], SegmentationModelSpec] = { + ("dinov3", "petiole"): SegmentationModelSpec( + variable_name="nersc-dinov3-seg-options", + settings=self.config.nersc_segment_dinov3_settings, + mlflow_model_name="dinov3-petiole", + mlflow_checkpoint_key="dino_checkpoint_path", + ), + ("dinov3", "moon"): SegmentationModelSpec( + variable_name="nersc-dinov3-moon-seg-options", + settings=self.config.nersc_segment_dinov3_moon_settings, + mlflow_model_name="dinov3-moon", + mlflow_checkpoint_key="dino_checkpoint_path", + extra_cli_flags={"--project": "moon"}, + ), + ("sam3", "petiole"): SegmentationModelSpec( + variable_name="nersc-segmentation-options", + settings=self.config.nersc_segment_sam3_settings, + mlflow_model_name="sam3-petiole", + mlflow_checkpoint_key="finetuned_checkpoint_path", + ), + # future: ("sam3", "moon"): SegmentationModelSpec(...), + } + key = (model, project) + if key not in registry: + raise ValueError( + f"No segmentation spec registered for model={model!r}, project={project!r}. " + f"Registered combinations: {list(registry)}" + ) + return registry[key] + def reconstruct( self, file_path: str = "", @@ -850,12 +914,14 @@ def segmentation_sam3( def segmentation_dinov3( self, recon_folder_path: str = "", + project: str = "petiole", ) -> bool: """ Run DINOv3 segmentation at NERSC Perlmutter via SFAPI Slurm job. :param recon_folder_path: Relative path to the reconstructed data folder, e.g. 'folder_name/recYYYYMMDD_hhmmss_scanname/' + :param project: Project name for segmentation settings. :return: True if the job completed successfully, False otherwise. """ logger.info("Starting NERSC DINOv3 segmentation process.") @@ -864,13 +930,17 @@ def segmentation_dinov3( pscratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" # Load from config - + spec = self._get_segmentation_spec("dinov3", project) opts = _load_job_options( - "nersc-dinov3-seg-options", - self.config.nersc_segment_dinov3_settings, + variable_name=spec.variable_name, + settings=spec.settings, config=self.config, - mlflow_model_name="dinov3-petiole", - mlflow_checkpoint_key="dino_checkpoint_path", + mlflow_model_name=spec.mlflow_model_name, + mlflow_checkpoint_key=spec.mlflow_checkpoint_key, + ) + + extra_flags = "\n".join( + f" {flag} {value} \\" for flag, value in spec.extra_cli_flags.items() ) cfs_path = opts["cfs_path"] @@ -967,6 +1037,7 @@ def segmentation_dinov3( --output-dir "{output_dir}" \\ --batch-size {batch_size} \\ --finetuned-checkpoint "{dino_checkpoint}" \\ + {extra_flags} --save-overlay SEG_STATUS=$? @@ -1758,7 +1829,7 @@ def nersc_petiole_segment_flow( recon_folder_path=scratch_path_tiff, config=config ) dinov3_future = nersc_segmentation_dinov3_task.submit( - recon_folder_path=scratch_path_tiff, config=config + recon_folder_path=scratch_path_tiff, config=config, project="petiole" ) # ── STEP 4: Transfer each model's output as it completes ───────────────── @@ -1804,7 +1875,7 @@ def nersc_petiole_segment_flow( logger.info("Running segmentation combination.") combine_future = nersc_combine_segmentations_task.submit( - recon_folder_path=scratch_path_tiff, config=config + recon_folder_path=scratch_path_tiff, config=config, project="petiole" ) combine_success = combine_future.result() @@ -1925,6 +1996,203 @@ def nersc_petiole_segment_flow( return False +@flow(name="nersc_moon_segment_flow", flow_run_name="nersc_moon_seg-{file_path}") +def nersc_moon_segment_flow( + file_path: str, + config: Config832 | None = None, + num_nodes: int | None = None, +) -> bool: + """Reconstruct a lunar regolith scan and run DINOv3-moon segmentation. + + Runs reconstruction then DINOv3-moon (ice, particles, pores). No SAM3 or + combine step — those are petiole-specific. Transfer and pruning follow the + same pattern as nersc_petiole_segment_flow. + + :param file_path: Path to the raw .h5 file to be processed. + :param config: Configuration object for the flow. + :param num_nodes: Number of nodes for reconstruction. + :return: True if reconstruction and segmentation both succeeded. + """ + logger = get_run_logger() + + if config is None: + logger.info("Initializing Config") + config = Config832() + + path = Path(file_path) + folder_name = path.parent.name + file_name = path.stem + scratch_path_tiff = f"{folder_name}/rec{file_name}" + scratch_path_segment = f"{folder_name}/seg{file_name}" + + logger.info(f"Starting NERSC reconstruction + DINOv3-moon flow for {file_path=}") + + controller = get_controller(hpc_type=HPC.NERSC, config=config) + + if num_nodes is None: + num_nodes = config.nersc_recon_settings.get("num_nodes", 4) + logger.info(f"Configured to use {num_nodes} nodes for reconstruction") + + # ── STEP 1: Reconstruction ──────────────────────────────────────────────── + recon_result = controller.reconstruct(file_path=file_path, num_nodes=num_nodes) + + if isinstance(recon_result, dict): + nersc_reconstruction_success = recon_result.get("success", False) + timing = recon_result.get("timing") + if timing: + logger.info("=" * 50) + logger.info("TIMING BREAKDOWN") + logger.info("=" * 50) + logger.info(f" Total job time: {timing.get('total', 'N/A')}s") + logger.info(f" Container pull: {timing.get('container_pull', 'N/A')}s") + logger.info( + f" File copy: {timing.get('file_copy', 'N/A')}s " + f"(skipped: {timing.get('copy_skipped', 'N/A')})" + ) + logger.info(f" Metadata detection: {timing.get('metadata', 'N/A')}s") + logger.info(f" RECONSTRUCTION: {timing.get('reconstruction', 'N/A')}s <-- actual recon time") + logger.info(f" Num slices: {timing.get('num_slices', 'N/A')}") + logger.info("=" * 50) + if all(k in timing for k in ["total", "reconstruction"]): + overhead = timing["total"] - timing["reconstruction"] + logger.info(f" Overhead: {overhead}s") + logger.info(f" Reconstruction %: {100 * timing['reconstruction'] / timing['total']:.1f}%") + logger.info("=" * 50) + else: + nersc_reconstruction_success = recon_result + + logger.info(f"NERSC reconstruction success: {nersc_reconstruction_success}") + + if not nersc_reconstruction_success: + logger.error("Reconstruction failed — aborting moon segmentation flow.") + raise ValueError("Reconstruction at NERSC failed") + + # ── STEP 2: Transfer TIFFs to data832 ──────────────────────────────────── + data832_tiff_future = None + try: + data832_tiff_future = globus_transfer_task.submit( + file_path=scratch_path_tiff, + source=config.nersc832_alsdev_pscratch_scratch, + destination=config.data832_scratch, + config=config, + ) + logger.info("TIFF transfer to data832 submitted.") + except Exception as e: + logger.error(f"Failed to submit TIFF transfer to data832: {e}") + + # ── STEP 3: DINOv3-moon segmentation ───────────────────────────────────── + logger.info("Submitting DINOv3-moon segmentation task.") + moon_future = nersc_segmentation_dinov3_task.submit( + recon_folder_path=scratch_path_tiff, config=config, project="moon" + ) + + moon_success = moon_future.result() + logger.info(f"DINOv3-moon segmentation result: {moon_success}") + + # ── STEP 4: Transfer segmentation outputs to data832 ───────────────────── + data832_moon_future = None + if moon_success: + moon_segment_path = f"{folder_name}/seg{file_name}/dino" + try: + data832_moon_future = globus_transfer_task.submit( + file_path=moon_segment_path, + source=config.nersc832_alsdev_pscratch_scratch, + destination=config.data832_scratch, + config=config, + ) + logger.info("DINOv3-moon transfer to data832 submitted.") + except Exception as e: + logger.error(f"Failed to submit DINOv3-moon transfer to data832: {e}") + + # ── STEP 5: Copy to NERSC CFS ───────────────────────────────────────────── + for cfs_path in [scratch_path_tiff, scratch_path_segment]: + try: + globus_transfer_task.submit( + file_path=cfs_path, + source=config.nersc832_alsdev_pscratch_scratch, + destination=config.nersc832_alsdev_scratch, + config=config, + ) + logger.info(f"CFS transfer submitted: {cfs_path}") + except Exception as e: + logger.error(f"Failed to copy {cfs_path} to NERSC CFS: {e}") + + # ── Resolve futures before pruning ──────────────────────────────────────── + data832_tiff_transfer_success = data832_tiff_future.result() if data832_tiff_future else False + data832_moon_transfer_success = data832_moon_future.result() if data832_moon_future else False + + logger.info( + f"Transfer results — tiff: {data832_tiff_transfer_success}, " + f"moon: {data832_moon_transfer_success}" + ) + + # ── STEP 6: Pruning ─────────────────────────────────────────────────────── + logger.info("Scheduling file pruning tasks.") + prune_controller = get_prune_controller(prune_type=PruneMethod.GLOBUS, config=config) + + try: + prune_controller.prune( + file_path=f"{folder_name}/{path.name}", + source_endpoint=config.nersc832_alsdev_pscratch_raw, + check_endpoint=None, + days_from_now=1.0, + ) + except Exception as e: + logger.warning(f"Failed to schedule raw data pruning: {e}") + + try: + prune_controller.prune( + file_path=scratch_path_tiff, + source_endpoint=config.nersc832_alsdev_pscratch_scratch, + check_endpoint=config.data832_scratch if data832_tiff_transfer_success else None, + days_from_now=1.0, + ) + except Exception as e: + logger.warning(f"Failed to schedule reconstruction data pruning: {e}") + + if moon_success: + try: + prune_controller.prune( + file_path=scratch_path_segment, + source_endpoint=config.nersc832_alsdev_pscratch_scratch, + check_endpoint=config.data832_scratch if data832_moon_transfer_success else None, + days_from_now=1.0, + ) + except Exception as e: + logger.warning(f"Failed to schedule segmentation data pruning: {e}") + + if data832_tiff_transfer_success: + try: + prune_controller.prune( + file_path=scratch_path_tiff, + source_endpoint=config.data832_scratch, + check_endpoint=None, + days_from_now=30.0, + ) + except Exception as e: + logger.warning(f"Failed to schedule data832 tiff pruning: {e}") + + if data832_moon_transfer_success: + try: + prune_controller.prune( + file_path=scratch_path_segment, + source_endpoint=config.data832_scratch, + check_endpoint=None, + days_from_now=30.0, + ) + except Exception as e: + logger.warning(f"Failed to schedule data832 moon segment pruning: {e}") + + if nersc_reconstruction_success and moon_success: + logger.info("NERSC reconstruction + DINOv3-moon flow completed successfully.") + return True + else: + logger.warning( + f"Flow completed with issues: recon={nersc_reconstruction_success}, moon={moon_success}" + ) + return False + + @flow(name="nersc_streaming_flow", on_cancellation=[cancellation_hook]) def nersc_streaming_flow( walltime: datetime.timedelta = datetime.timedelta(minutes=5), @@ -2079,14 +2347,15 @@ def nersc_segmentation_sam3_task( def nersc_segmentation_dinov3_task( recon_folder_path: str, config: Optional[Config832] = None, + project: Optional[str] = "petiole", ) -> bool: logger = get_run_logger() if config is None: logger.info("No config provided, using default Config832.") config = Config832() tomography_controller = get_controller(hpc_type=HPC.NERSC, config=config) - logger.info(f"Starting NERSC DINOv3 segmentation task for {recon_folder_path=}") - success = tomography_controller.segmentation_dinov3(recon_folder_path=recon_folder_path) + logger.info(f"Starting NERSC DINOv3 segmentation task for {recon_folder_path=}, {project=}") + success = tomography_controller.segmentation_dinov3(recon_folder_path=recon_folder_path, project=project) if not success: logger.error("DINOv3 segmentation failed.") else: @@ -2098,14 +2367,15 @@ def nersc_segmentation_dinov3_task( def nersc_combine_segmentations_task( recon_folder_path: str, config: Optional[Config832] = None, + project: Optional[str] = None, ) -> bool: logger = get_run_logger() if config is None: logger.info("No config provided, using default Config832.") config = Config832() tomography_controller = get_controller(hpc_type=HPC.NERSC, config=config) - logger.info(f"Starting NERSC combine segmentations task for {recon_folder_path=}") - success = tomography_controller.combine_segmentations(recon_folder_path=recon_folder_path) + logger.info(f"Starting NERSC combine segmentations task for {recon_folder_path=}, {project=}") + success = tomography_controller.combine_segmentations(recon_folder_path=recon_folder_path, project=project) if not success: logger.error("Combine segmentations failed.") else: From 5249be3b24efd5f11209b650ffacd36aa76ca308 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 6 Apr 2026 15:15:14 -0700 Subject: [PATCH 03/10] Adding block to register DINOv3-moon to MLflow --- orchestration/flows/bl832/register_mlflow.py | 56 ++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/orchestration/flows/bl832/register_mlflow.py b/orchestration/flows/bl832/register_mlflow.py index 2b202057..814d622b 100644 --- a/orchestration/flows/bl832/register_mlflow.py +++ b/orchestration/flows/bl832/register_mlflow.py @@ -61,6 +61,22 @@ def register_mlflow_checkpoints(): }, ) + register_checkpoint( + model_name="dinov3-moon", + nersc_path="/global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/dino_moon/best.ckpt", + alcf_path="/eagle/IRIBeta/als/seg_models/dino_moon/best.ckpt", + config=config, + alias="production", + description="DINOv3 fine-tuned on lunar regolith micro-CT data (ice, particles, pores).", + inference_params={ + "conda_env_path": "/global/cfs/cdirs/als/data_mover/8.3.2/envs/dino_demo", + "seg_scripts_dir": f"{scripts_dir}inference_v5_multiseg/forge_feb_seg_model_demo/", + "script_name": "src.inference_dino_v2", + "batch_size": 4, + "nproc_per_node": 4, + }, + ) + def retrieve_mlflow_params_test() -> bool: """Test that _load_job_options correctly pulls inference params from the MLflow registry. @@ -179,6 +195,46 @@ def retrieve_mlflow_params_test() -> bool: if not passed: all_passed = False + # ── DINOv3-moon ─────────────────────────────────────────────────────────── + logger.info("=" * 60) + logger.info("Testing DINOv3-moon option resolution") + logger.info("=" * 60) + + moon_opts = _load_job_options( + "nersc-dinov3-moon-seg-options", + config.nersc_segment_dinov3_moon_settings, + config=config, + mlflow_model_name="dinov3-moon", + mlflow_checkpoint_key="dino_checkpoint_path", + ) + + moon_checks = { + "dino_checkpoint_path": ( + lambda v: v.endswith(".ckpt"), + "dino_checkpoint_path should end with .ckpt" + ), + "script_name": ( + lambda v: "v2" in v.lower(), + "script_name should reference inference_dino_v2" + ), + "batch_size": ( + lambda v: isinstance(v, int) and v > 0, + "batch_size should be a positive int" + ), + "qos": ( + lambda v: v == config.nersc_segment_dinov3_moon_settings["qos"], + "qos should be unchanged from config" + ), + } + + for key, (check_fn, message) in moon_checks.items(): + value = moon_opts.get(key) + passed = check_fn(value) if value is not None else False + status = "✓" if passed else "✗" + logger.info(f" [{status}] {key}={value!r} — {message}") + if not passed: + all_passed = False + # ── Summary ─────────────────────────────────────────────────────────────── logger.info("=" * 60) if all_passed: From 83e6a7b4c654727b81a38cb274e6484de7472057 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 6 Apr 2026 15:16:01 -0700 Subject: [PATCH 04/10] Adding moon segmentation to dispatcher options --- orchestration/flows/bl832/dispatcher.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/orchestration/flows/bl832/dispatcher.py b/orchestration/flows/bl832/dispatcher.py index 6f216f6d..d28a02eb 100644 --- a/orchestration/flows/bl832/dispatcher.py +++ b/orchestration/flows/bl832/dispatcher.py @@ -28,6 +28,10 @@ class FlowParameterMapper: "num_nodes", "config"], "nersc_petiole_segment_flow/nersc_petiole_segment_flow": [ + "file_path", + "num_nodes", + "config"], + "nersc_moon_segment_flow/nersc_moon_segment_flow": [ "file_path", "num_nodes", "config"] @@ -65,6 +69,7 @@ def setup_decision_settings( alcf_recon: bool, nersc_recon: bool, nersc_petiole_segment: bool, + nersc_moon_segment: bool, new_file_832: bool ) -> dict: """ @@ -73,6 +78,7 @@ def setup_decision_settings( :param alcf_recon: Boolean indicating whether to run the ALCF reconstruction flow. :param nersc_recon: Boolean indicating whether to run the NERSC reconstruction flow. :param nersc_petiole_segment: Boolean indicating whether to run the NERSC petiole segmentation flow. + :param nersc_moon_segment: Boolean indicating whether to run the NERSC moon segmentation flow. :param new_file_832: Boolean indicating whether to move files to NERSC. :return: A dictionary containing the settings for each flow. """ @@ -81,12 +87,14 @@ def setup_decision_settings( logger.info(f"Setting up decision settings: alcf_recon={alcf_recon}, " f"nersc_recon={nersc_recon}, " f"nersc_petiole_segment={nersc_petiole_segment}, " + f"nersc_moon_segment={nersc_moon_segment}, " f"new_file_832={new_file_832}") # Define which flows to run based on the input settings settings = { "alcf_recon_flow/alcf_recon_flow": alcf_recon, "nersc_recon_flow/nersc_recon_flow": nersc_recon, "nersc_petiole_segment_flow/nersc_petiole_segment_flow": nersc_petiole_segment, + "nersc_moon_segment_flow/nersc_moon_segment_flow": nersc_moon_segment, "new_832_file_flow/new_file_832": new_file_832 } # Save the settings in a JSON block for later retrieval by other flows @@ -172,6 +180,12 @@ async def dispatcher( run_recon_flow_async("nersc_petiole_segment_flow/nersc_petiole_segment_flow", nersc_petiole_segment_params) ) + if decision_settings.get("nersc_moon_segment_flow/nersc_moon_segment_flow"): + moon_params = FlowParameterMapper.get_flow_parameters( + "nersc_moon_segment_flow/nersc_moon_segment_flow", available_params + ) + tasks.append(run_recon_flow_async("nersc_moon_segment_flow/nersc_moon_segment_flow", moon_params)) + # Run ALCF and NERSC flows in parallel, if any if tasks: try: From a3e38f9e4c80cd9e831d5b6e00a19f2496b740a1 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 6 Apr 2026 15:40:49 -0700 Subject: [PATCH 05/10] Updating pytest --- orchestration/_tests/test_bl832/test_nersc.py | 90 ++++++++++++++++++- orchestration/flows/bl832/nersc.py | 13 ++- 2 files changed, 95 insertions(+), 8 deletions(-) diff --git a/orchestration/_tests/test_bl832/test_nersc.py b/orchestration/_tests/test_bl832/test_nersc.py index 3fa092ed..8d7056a8 100644 --- a/orchestration/_tests/test_bl832/test_nersc.py +++ b/orchestration/_tests/test_bl832/test_nersc.py @@ -540,7 +540,7 @@ def test_nersc_segmentation_dinov3_task_success(mocker, mock_config832): config=mock_config832 ) - mock_controller.segmentation_dinov3.assert_called_once_with(recon_folder_path="folder/recfile") + mock_controller.segmentation_dinov3.assert_called_once_with(recon_folder_path="folder/recfile", project="petiole") assert result is True @@ -706,3 +706,91 @@ def test_petiole_segment_flow_recon_failure(mocker, mock_config832): with pytest.raises(ValueError, match="Reconstruction at NERSC Failed"): nersc_petiole_segment_flow(file_path="folder/file.h5", num_nodes=4, config=None) + +# ────────────────────────────────────────────────────────────────────────────── +# nersc_moon_segment_flow (recon + DINOv3-moon only, no SAM3, no combine) +# ────────────────────────────────────────────────────────────────────────────── + + +def test_moon_segment_flow_succeeds(mocker, mock_config832, mock_recon_success): + """Recon + DINOv3-moon both succeed → flow returns True.""" + from orchestration.flows.bl832.nersc import nersc_moon_segment_flow + + mock_controller = mocker.MagicMock() + mock_controller.reconstruct.return_value = mock_recon_success + mocker.patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller) + + mock_globus_transfer = mocker.patch("orchestration.flows.bl832.nersc.globus_transfer_task") + mock_globus_transfer.submit.return_value = _make_future(mocker, True) + + mocker.patch("orchestration.flows.bl832.nersc.get_prune_controller", return_value=mocker.MagicMock()) + + mock_dinov3_task = mocker.patch("orchestration.flows.bl832.nersc.nersc_segmentation_dinov3_task") + mock_dinov3_task.submit.return_value = _make_future(mocker, True) + + result = nersc_moon_segment_flow(file_path="folder/file.h5", num_nodes=4, config=None) + + assert result is True + mock_controller.reconstruct.assert_called_once() + mock_dinov3_task.submit.assert_called_once_with( + recon_folder_path="folder/recfile", config=mock_config832, project="moon" + ) + + +def test_moon_segment_flow_seg_failure(mocker, mock_config832, mock_recon_success): + """Recon succeeds but DINOv3-moon fails → flow returns False.""" + from orchestration.flows.bl832.nersc import nersc_moon_segment_flow + + mock_controller = mocker.MagicMock() + mock_controller.reconstruct.return_value = mock_recon_success + mocker.patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller) + + mock_globus_transfer = mocker.patch("orchestration.flows.bl832.nersc.globus_transfer_task") + mock_globus_transfer.submit.return_value = _make_future(mocker, False) + + mocker.patch("orchestration.flows.bl832.nersc.get_prune_controller", return_value=mocker.MagicMock()) + + mock_dinov3_task = mocker.patch("orchestration.flows.bl832.nersc.nersc_segmentation_dinov3_task") + mock_dinov3_task.submit.return_value = _make_future(mocker, False) + + result = nersc_moon_segment_flow(file_path="folder/file.h5", num_nodes=4, config=None) + + assert result is False + + +def test_moon_segment_flow_recon_failure(mocker, mock_config832): + """Recon failure should raise ValueError immediately.""" + from orchestration.flows.bl832.nersc import nersc_moon_segment_flow + + mock_controller = mocker.MagicMock() + mock_controller.reconstruct.return_value = {"success": False, "job_id": None, "timing": None} + mocker.patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller) + mocker.patch("orchestration.flows.bl832.nersc.globus_transfer_task") + mocker.patch("orchestration.flows.bl832.nersc.get_prune_controller", return_value=mocker.MagicMock()) + + with pytest.raises(ValueError, match="Reconstruction at NERSC failed"): + nersc_moon_segment_flow(file_path="folder/file.h5", num_nodes=4, config=None) + + +def test_moon_segment_flow_no_sam3_no_combine(mocker, mock_config832, mock_recon_success): + """SAM3 and combine tasks should never be called in the moon flow.""" + from orchestration.flows.bl832.nersc import nersc_moon_segment_flow + + mock_controller = mocker.MagicMock() + mock_controller.reconstruct.return_value = mock_recon_success + mocker.patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller) + + mock_globus_transfer = mocker.patch("orchestration.flows.bl832.nersc.globus_transfer_task") + mock_globus_transfer.submit.return_value = _make_future(mocker, True) + + mocker.patch("orchestration.flows.bl832.nersc.get_prune_controller", return_value=mocker.MagicMock()) + + mock_sam3_task = mocker.patch("orchestration.flows.bl832.nersc.nersc_segmentation_sam3_task") + mock_combine_task = mocker.patch("orchestration.flows.bl832.nersc.nersc_combine_segmentations_task") + mock_dinov3_task = mocker.patch("orchestration.flows.bl832.nersc.nersc_segmentation_dinov3_task") + mock_dinov3_task.submit.return_value = _make_future(mocker, True) + + nersc_moon_segment_flow(file_path="folder/file.h5", num_nodes=4, config=None) + + mock_sam3_task.submit.assert_not_called() + mock_combine_task.submit.assert_not_called() diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index cbe2482a..e11a159c 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -644,8 +644,8 @@ def segmentation_sam3( pscratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" opts = _load_job_options( - "nersc-segmentation-options", - self.config.nersc_segment_sam3_settings, + variable_name="nersc-segmentation-options", + config_settings=self.config.nersc_segment_sam3_settings, config=self.config, mlflow_model_name="sam3-petiole", mlflow_checkpoint_key="finetuned_checkpoint_path", @@ -933,7 +933,7 @@ def segmentation_dinov3( spec = self._get_segmentation_spec("dinov3", project) opts = _load_job_options( variable_name=spec.variable_name, - settings=spec.settings, + config_settings=spec.settings, config=self.config, mlflow_model_name=spec.mlflow_model_name, mlflow_checkpoint_key=spec.mlflow_checkpoint_key, @@ -1875,7 +1875,7 @@ def nersc_petiole_segment_flow( logger.info("Running segmentation combination.") combine_future = nersc_combine_segmentations_task.submit( - recon_folder_path=scratch_path_tiff, config=config, project="petiole" + recon_folder_path=scratch_path_tiff, config=config ) combine_success = combine_future.result() @@ -2367,15 +2367,14 @@ def nersc_segmentation_dinov3_task( def nersc_combine_segmentations_task( recon_folder_path: str, config: Optional[Config832] = None, - project: Optional[str] = None, ) -> bool: logger = get_run_logger() if config is None: logger.info("No config provided, using default Config832.") config = Config832() tomography_controller = get_controller(hpc_type=HPC.NERSC, config=config) - logger.info(f"Starting NERSC combine segmentations task for {recon_folder_path=}, {project=}") - success = tomography_controller.combine_segmentations(recon_folder_path=recon_folder_path, project=project) + logger.info(f"Starting NERSC combine segmentations task for {recon_folder_path=}") + success = tomography_controller.combine_segmentations(recon_folder_path=recon_folder_path) if not success: logger.error("Combine segmentations failed.") else: From d5b239d23a524b65def535e927ab08abcfad0651 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 7 Apr 2026 10:47:15 -0700 Subject: [PATCH 06/10] Updating config for dino_moon inference script/checkpoint locations --- config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.yml b/config.yml index 69a2c077..383fe22c 100644 --- a/config.yml +++ b/config.yml @@ -282,5 +282,5 @@ hpc_submission_settings832: # ── Paths ───────────────────────────────────────────────────────────────── cfs_path: /global/cfs/cdirs/als/data_mover/8.3.2 conda_env_path: /global/cfs/cdirs/als/data_mover/8.3.2/envs/dino_demo - seg_scripts_dir: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/inference_v5_multiseg/forge_feb_seg_model_demo/ - dino_checkpoint_path: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/dino_moon/best.ckpt + seg_scripts_dir: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/moon_seg/forge_feb_seg_model_demo/ + dino_checkpoint_path: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/dino/best_moon.ckpt From b2e82e51f2ae49cbc5ca53ce39b4d00460acd2f0 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 17 Apr 2026 10:11:15 -0700 Subject: [PATCH 07/10] Adding recent changes --- orchestration/flows/bl832/nersc.py | 8 ++++++++ orchestration/flows/bl832/register_mlflow.py | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index e11a159c..f4ffc9fb 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -2398,3 +2398,11 @@ def nersc_segmentation_sam3_integration_test() -> bool: ) logger.info(f"Flow success: {flow_success}") return flow_success + + +if __name__ == "__main__": + nersc_segmentation_dinov3_task( + recon_folder_path='dabramov/recmoon/', + config=Config832(), + project="moon" + ) diff --git a/orchestration/flows/bl832/register_mlflow.py b/orchestration/flows/bl832/register_mlflow.py index 814d622b..31fa3760 100644 --- a/orchestration/flows/bl832/register_mlflow.py +++ b/orchestration/flows/bl832/register_mlflow.py @@ -63,14 +63,14 @@ def register_mlflow_checkpoints(): register_checkpoint( model_name="dinov3-moon", - nersc_path="/global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/dino_moon/best.ckpt", - alcf_path="/eagle/IRIBeta/als/seg_models/dino_moon/best.ckpt", + nersc_path="/global/cfs/cdirs/als/data_mover/8.3.2/tomography_segmentation_scripts/dino/best_moon.ckpt", + alcf_path="/eagle/IRIBeta/als/seg_models/dino/best_moon.ckpt", config=config, alias="production", description="DINOv3 fine-tuned on lunar regolith micro-CT data (ice, particles, pores).", inference_params={ "conda_env_path": "/global/cfs/cdirs/als/data_mover/8.3.2/envs/dino_demo", - "seg_scripts_dir": f"{scripts_dir}inference_v5_multiseg/forge_feb_seg_model_demo/", + "seg_scripts_dir": f"{scripts_dir}moon_seg/forge_feb_seg_model_demo/", "script_name": "src.inference_dino_v2", "batch_size": 4, "nproc_per_node": 4, From 77fc204ee48a8f10d0cbd59a9e379b89880717a6 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 17 Apr 2026 10:18:06 -0700 Subject: [PATCH 08/10] adding nersc_moon_segment_flow to prefect.yaml --- orchestration/flows/bl832/prefect.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/orchestration/flows/bl832/prefect.yaml b/orchestration/flows/bl832/prefect.yaml index b827045e..aa9ed2c2 100644 --- a/orchestration/flows/bl832/prefect.yaml +++ b/orchestration/flows/bl832/prefect.yaml @@ -49,6 +49,12 @@ deployments: name: nersc_recon_flow_pool work_queue_name: nersc_petiole_segment_flow_queue +- name: nersc_moon_segment_flow + entrypoint: orchestration/flows/bl832/nersc.py:nersc_moon_segment_flow + work_pool: + name: nersc_recon_flow_pool + work_queue_name: nersc_moon_segment_flow_queue + - name: nersc_streaming_flow entrypoint: orchestration/flows/bl832/nersc.py:nersc_streaming_flow work_pool: From 943862f0b9f47a342fb339160933ca9f28a2ed0d Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 17 Apr 2026 14:01:20 -0700 Subject: [PATCH 09/10] Adding NERSC reservation info for moon seg beamtime --- config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.yml b/config.yml index 383fe22c..ea576b54 100644 --- a/config.yml +++ b/config.yml @@ -180,8 +180,8 @@ hpc_submission_settings832: # ── SLURM resource allocation ───────────────────────────────────────────── qos: realtime account: als - reservation: "" - num_nodes: 4 + reservation: "_CAP_TOMO_MOON_CPU" + num_nodes: 16 cpus-per-task: 128 walltime: "0:30:00" nersc_multiresolution: @@ -268,7 +268,7 @@ hpc_submission_settings832: qos: regular account: als constraint: gpu - reservation: "" + reservation: "_CAP_TOMO_MOON_GPU" num_nodes: 4 ntasks-per-node: 1 nproc_per_node: 4 From e17a9ae5710a72257cfc79e55c8577d91875a2ef Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 20 Apr 2026 10:35:02 -0700 Subject: [PATCH 10/10] pinning authlib==1.6.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f61194d6..7fdf24af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -authlib +authlib==1.6.1 dynaconf globus-compute-sdk @ git+https://github.com/globus/globus-compute.git@d1731340074be56861ec91d732bdff44f8e2b46e#subdirectory=compute_sdk globus-sdk>=3.0