Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion xrspatial/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,7 @@ def open_geotiff(self, source, **kwargs):
y_min, y_max = float(y.min()), float(y.max())
x_min, x_max = float(x.min()), float(x.max())

geo_info, file_h, file_w = _read_geo_info(source)
geo_info, file_h, file_w, _dtype, _nbands = _read_geo_info(source)
t = geo_info.transform

# Expand extent by half a pixel so we capture edge pixels
Expand Down
32 changes: 23 additions & 9 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,19 @@ def _coords_to_transform(da: xr.DataArray) -> GeoTransform | None:
)


def _read_geo_info(source: str):
def _read_geo_info(source: str, *, overview_level: int | None = None):
"""Read only the geographic metadata and image dimensions from a GeoTIFF.

Returns (geo_info, height, width) without reading pixel data.
Returns (geo_info, height, width, dtype, n_bands) without reading pixel
data. Uses mmap for header-only access -- O(1) memory regardless of file
size.

Parameters
----------
overview_level : int or None
Overview IFD index (0 = full resolution).
"""
from ._dtypes import tiff_dtype_to_numpy
from ._geotags import extract_geo_info
from ._header import parse_all_ifds, parse_header

Expand All @@ -128,9 +136,17 @@ def _read_geo_info(source: str):
try:
header = parse_header(data)
ifds = parse_all_ifds(data, header)
ifd = ifds[0]
ifd_idx = 0
if overview_level is not None:
ifd_idx = min(overview_level, len(ifds) - 1)
ifd = ifds[ifd_idx]
geo_info = extract_geo_info(ifd, data, header.byte_order)
return geo_info, ifd.height, ifd.width
bps = ifd.bits_per_sample
if isinstance(bps, tuple):
bps = bps[0]
file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0
return geo_info, ifd.height, ifd.width, file_dtype, n_bands
finally:
data.close()

Expand Down Expand Up @@ -873,11 +889,9 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512,
if source.lower().endswith('.vrt'):
return read_vrt(source, dtype=dtype, name=name, chunks=chunks)

# First, do a metadata-only read to get shape, dtype, coords, attrs
arr, geo_info = read_to_array(source, overview_level=overview_level)
full_h, full_w = arr.shape[:2]
n_bands = arr.shape[2] if arr.ndim == 3 else 0
file_dtype = arr.dtype
# Metadata-only read: O(1) memory via mmap, no pixel decompression
geo_info, full_h, full_w, file_dtype, n_bands = _read_geo_info(
source, overview_level=overview_level)
nodata = geo_info.nodata

# Nodata masking promotes integer arrays to float64 (for NaN).
Expand Down
53 changes: 31 additions & 22 deletions xrspatial/reproject/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -965,20 +965,18 @@ def _reproject_dask_cupy(
resampling, nodata, precision,
chunk_size,
):
"""Dask+CuPy backend: process output chunks on GPU sequentially.

Instead of dask.delayed per chunk (which has ~15ms overhead each from
pyproj init + small CUDA launches), we:
1. Create CRS/transformer objects once
2. Use GPU-sized output chunks (2048x2048 by default)
3. For each output chunk, compute CUDA coordinates and fetch only
the source window needed from the dask array
4. Assemble the result as a CuPy array

For sources that fit in GPU memory, this is ~22x faster than the
dask.delayed path. For sources that don't fit, each chunk fetches
only its required window, so GPU memory usage scales with chunk size,
not source size.
"""Dask+CuPy backend: process output chunks on GPU.

Two modes based on available GPU memory:

**Fast path** (output fits in GPU VRAM): pre-allocates the full output
on GPU and fills it chunk-by-chunk. ~22x faster than the map_blocks
path because CRS/transformer objects are created once and CUDA kernels
run with minimal launch overhead.

**Chunked fallback** (output exceeds GPU VRAM): delegates to
``_reproject_dask(is_cupy=True)`` which uses ``map_blocks`` and
processes one chunk at a time with O(chunk_size) GPU memory.
"""
import cupy as cp

Expand All @@ -999,18 +997,29 @@ def _reproject_dask_cupy(
src_res_x = (src_right - src_left) / src_w
src_res_y = (src_top - src_bottom) / src_h

# Memory guard: the full output is allocated on GPU.
# Memory check: if the full output doesn't fit in GPU memory,
# fall back to the map_blocks path which is O(chunk_size) memory.
estimated = out_shape[0] * out_shape[1] * 8 # float64
try:
free_gpu, _total = cp.cuda.Device().mem_info
if estimated > 0.8 * free_gpu:
raise MemoryError(
f"_reproject_dask_cupy needs ~{estimated / 1e9:.1f} GB on GPU "
f"for the full output but only ~{free_gpu / 1e9:.1f} GB free. "
f"Reduce output resolution or use the dask+numpy path."
)
fits_in_gpu = estimated < 0.5 * free_gpu
except (AttributeError, RuntimeError):
pass # no device info available
fits_in_gpu = False

if not fits_in_gpu:
import warnings
warnings.warn(
f"Output ({estimated / 1e9:.1f} GB) exceeds GPU memory; "
f"falling back to chunked map_blocks path.",
stacklevel=3,
)
return _reproject_dask(
raster, src_bounds, src_shape, y_desc,
src_wkt, tgt_wkt,
out_bounds, out_shape,
resampling, nodata, precision,
chunk_size or 2048, True, # is_cupy=True
)

result = cp.full(out_shape, nodata, dtype=cp.float64)

Expand Down
58 changes: 42 additions & 16 deletions xrspatial/sieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,32 @@ def _label_connected(data, valid, neighborhood):
):
_uf_union(parent, rank, idx, (r - 1) * cols + (c + 1))

# --- Count unique regions first so region_val_buf is right-sized ---
# Reuse rank array (no longer needed after union-find) as root_to_id.
# This eliminates a separate n-element int32 allocation.
root_to_id = rank # alias; rank is dead after union-find
for i in range(n):
root_to_id[i] = 0 # clear

n_regions = 0
for i in range(n):
r = i // cols
c = i % cols
if not valid[r, c]:
continue
root = _uf_find(parent, i)
if root_to_id[root] == 0:
root_to_id[root] = 1 # mark as seen
n_regions += 1

# Allocate region_val_buf at actual region count, not pixel count.
# For a 46K x 46K raster with 100K regions this saves ~16 GB.
region_val_buf = np.full(n_regions + 1, np.nan, dtype=np.float64)

# Assign contiguous region IDs
region_map_flat = np.zeros(n, dtype=np.int32)
root_to_id = np.zeros(n, dtype=np.int32)
region_val_buf = np.full(n + 1, np.nan, dtype=np.float64)
for i in range(n):
root_to_id[i] = 0 # clear for ID assignment
next_id = 1

for i in range(n):
Expand Down Expand Up @@ -319,14 +341,17 @@ def _available_memory_bytes():
def _sieve_dask(data, threshold, neighborhood, skip_values):
"""Dask+numpy backend: compute to numpy, sieve, wrap back."""
avail = _available_memory_bytes()
estimated_bytes = np.prod(data.shape) * data.dtype.itemsize
if estimated_bytes * 5 > 0.5 * avail:
n_pixels = np.prod(data.shape)
# Peak memory: input + result (float64 each) + parent + rank +
# region_map_flat (int32 each) = 2*8 + 3*4 = 28 bytes/pixel.
estimated_bytes = n_pixels * 28
if estimated_bytes > 0.5 * avail:
raise MemoryError(
f"sieve() needs the full array in memory "
f"(~{estimated_bytes * 5 / 1e9:.1f} GB) but only "
f"~{avail / 1e9:.1f} GB is available. Connected-component "
f"labeling is a global operation that cannot be chunked. "
f"Consider downsampling or tiling the input manually."
f"sieve() needs ~{estimated_bytes / 1e9:.1f} GB for the full "
f"array plus CCL bookkeeping, but only ~{avail / 1e9:.1f} GB "
f"is available. Connected-component labeling is a global "
f"operation that cannot be chunked. Consider downsampling "
f"or tiling the input manually."
)

np_data = data.compute()
Expand All @@ -338,18 +363,19 @@ def _sieve_dask(data, threshold, neighborhood, skip_values):

def _sieve_dask_cupy(data, threshold, neighborhood, skip_values):
"""Dask+CuPy backend: compute to cupy, sieve via CPU fallback, wrap back."""
estimated_bytes = np.prod(data.shape) * data.dtype.itemsize
n_pixels = np.prod(data.shape)
estimated_bytes = n_pixels * 28
try:
import cupy as cp

free_gpu, _total = cp.cuda.Device().mem_info
if estimated_bytes * 5 > 0.5 * free_gpu:
if estimated_bytes > 0.5 * free_gpu:
raise MemoryError(
f"sieve() needs the full array on GPU "
f"(~{estimated_bytes * 5 / 1e9:.1f} GB) but only "
f"~{free_gpu / 1e9:.1f} GB free. Connected-component "
f"labeling is a global operation that cannot be chunked. "
f"Consider downsampling or tiling the input manually."
f"sieve() needs ~{estimated_bytes / 1e9:.1f} GB for the "
f"full array plus CCL bookkeeping, but only "
f"~{free_gpu / 1e9:.1f} GB free GPU memory. Connected-"
f"component labeling is a global operation that cannot be "
f"chunked. Consider downsampling or tiling the input."
)
except (ImportError, AttributeError):
pass
Expand Down
Loading