diff --git a/xrspatial/accessor.py b/xrspatial/accessor.py index bd3bb35d..c5b80670 100644 --- a/xrspatial/accessor.py +++ b/xrspatial/accessor.py @@ -1027,7 +1027,7 @@ def open_geotiff(self, source, **kwargs): y_min, y_max = float(y.min()), float(y.max()) x_min, x_max = float(x.min()), float(x.max()) - geo_info, file_h, file_w = _read_geo_info(source) + geo_info, file_h, file_w, _dtype, _nbands = _read_geo_info(source) t = geo_info.transform # Expand extent by half a pixel so we capture edge pixels diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 1a300afc..e93f525c 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -114,11 +114,19 @@ def _coords_to_transform(da: xr.DataArray) -> GeoTransform | None: ) -def _read_geo_info(source: str): +def _read_geo_info(source: str, *, overview_level: int | None = None): """Read only the geographic metadata and image dimensions from a GeoTIFF. - Returns (geo_info, height, width) without reading pixel data. + Returns (geo_info, height, width, dtype, n_bands) without reading pixel + data. Uses mmap for header-only access -- O(1) memory regardless of file + size. + + Parameters + ---------- + overview_level : int or None + Overview IFD index (0 = full resolution). """ + from ._dtypes import tiff_dtype_to_numpy from ._geotags import extract_geo_info from ._header import parse_all_ifds, parse_header @@ -128,9 +136,17 @@ def _read_geo_info(source: str): try: header = parse_header(data) ifds = parse_all_ifds(data, header) - ifd = ifds[0] + ifd_idx = 0 + if overview_level is not None: + ifd_idx = min(overview_level, len(ifds) - 1) + ifd = ifds[ifd_idx] geo_info = extract_geo_info(ifd, data, header.byte_order) - return geo_info, ifd.height, ifd.width + bps = ifd.bits_per_sample + if isinstance(bps, tuple): + bps = bps[0] + file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) + n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0 + return geo_info, ifd.height, ifd.width, file_dtype, n_bands finally: data.close() @@ -873,11 +889,9 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, if source.lower().endswith('.vrt'): return read_vrt(source, dtype=dtype, name=name, chunks=chunks) - # First, do a metadata-only read to get shape, dtype, coords, attrs - arr, geo_info = read_to_array(source, overview_level=overview_level) - full_h, full_w = arr.shape[:2] - n_bands = arr.shape[2] if arr.ndim == 3 else 0 - file_dtype = arr.dtype + # Metadata-only read: O(1) memory via mmap, no pixel decompression + geo_info, full_h, full_w, file_dtype, n_bands = _read_geo_info( + source, overview_level=overview_level) nodata = geo_info.nodata # Nodata masking promotes integer arrays to float64 (for NaN). diff --git a/xrspatial/reproject/__init__.py b/xrspatial/reproject/__init__.py index a454a349..f87f6c60 100644 --- a/xrspatial/reproject/__init__.py +++ b/xrspatial/reproject/__init__.py @@ -965,20 +965,18 @@ def _reproject_dask_cupy( resampling, nodata, precision, chunk_size, ): - """Dask+CuPy backend: process output chunks on GPU sequentially. - - Instead of dask.delayed per chunk (which has ~15ms overhead each from - pyproj init + small CUDA launches), we: - 1. Create CRS/transformer objects once - 2. Use GPU-sized output chunks (2048x2048 by default) - 3. For each output chunk, compute CUDA coordinates and fetch only - the source window needed from the dask array - 4. Assemble the result as a CuPy array - - For sources that fit in GPU memory, this is ~22x faster than the - dask.delayed path. For sources that don't fit, each chunk fetches - only its required window, so GPU memory usage scales with chunk size, - not source size. + """Dask+CuPy backend: process output chunks on GPU. + + Two modes based on available GPU memory: + + **Fast path** (output fits in GPU VRAM): pre-allocates the full output + on GPU and fills it chunk-by-chunk. ~22x faster than the map_blocks + path because CRS/transformer objects are created once and CUDA kernels + run with minimal launch overhead. + + **Chunked fallback** (output exceeds GPU VRAM): delegates to + ``_reproject_dask(is_cupy=True)`` which uses ``map_blocks`` and + processes one chunk at a time with O(chunk_size) GPU memory. """ import cupy as cp @@ -999,18 +997,29 @@ def _reproject_dask_cupy( src_res_x = (src_right - src_left) / src_w src_res_y = (src_top - src_bottom) / src_h - # Memory guard: the full output is allocated on GPU. + # Memory check: if the full output doesn't fit in GPU memory, + # fall back to the map_blocks path which is O(chunk_size) memory. estimated = out_shape[0] * out_shape[1] * 8 # float64 try: free_gpu, _total = cp.cuda.Device().mem_info - if estimated > 0.8 * free_gpu: - raise MemoryError( - f"_reproject_dask_cupy needs ~{estimated / 1e9:.1f} GB on GPU " - f"for the full output but only ~{free_gpu / 1e9:.1f} GB free. " - f"Reduce output resolution or use the dask+numpy path." - ) + fits_in_gpu = estimated < 0.5 * free_gpu except (AttributeError, RuntimeError): - pass # no device info available + fits_in_gpu = False + + if not fits_in_gpu: + import warnings + warnings.warn( + f"Output ({estimated / 1e9:.1f} GB) exceeds GPU memory; " + f"falling back to chunked map_blocks path.", + stacklevel=3, + ) + return _reproject_dask( + raster, src_bounds, src_shape, y_desc, + src_wkt, tgt_wkt, + out_bounds, out_shape, + resampling, nodata, precision, + chunk_size or 2048, True, # is_cupy=True + ) result = cp.full(out_shape, nodata, dtype=cp.float64) diff --git a/xrspatial/sieve.py b/xrspatial/sieve.py index 1ed66d91..ff35a226 100644 --- a/xrspatial/sieve.py +++ b/xrspatial/sieve.py @@ -131,10 +131,32 @@ def _label_connected(data, valid, neighborhood): ): _uf_union(parent, rank, idx, (r - 1) * cols + (c + 1)) + # --- Count unique regions first so region_val_buf is right-sized --- + # Reuse rank array (no longer needed after union-find) as root_to_id. + # This eliminates a separate n-element int32 allocation. + root_to_id = rank # alias; rank is dead after union-find + for i in range(n): + root_to_id[i] = 0 # clear + + n_regions = 0 + for i in range(n): + r = i // cols + c = i % cols + if not valid[r, c]: + continue + root = _uf_find(parent, i) + if root_to_id[root] == 0: + root_to_id[root] = 1 # mark as seen + n_regions += 1 + + # Allocate region_val_buf at actual region count, not pixel count. + # For a 46K x 46K raster with 100K regions this saves ~16 GB. + region_val_buf = np.full(n_regions + 1, np.nan, dtype=np.float64) + # Assign contiguous region IDs region_map_flat = np.zeros(n, dtype=np.int32) - root_to_id = np.zeros(n, dtype=np.int32) - region_val_buf = np.full(n + 1, np.nan, dtype=np.float64) + for i in range(n): + root_to_id[i] = 0 # clear for ID assignment next_id = 1 for i in range(n): @@ -319,14 +341,17 @@ def _available_memory_bytes(): def _sieve_dask(data, threshold, neighborhood, skip_values): """Dask+numpy backend: compute to numpy, sieve, wrap back.""" avail = _available_memory_bytes() - estimated_bytes = np.prod(data.shape) * data.dtype.itemsize - if estimated_bytes * 5 > 0.5 * avail: + n_pixels = np.prod(data.shape) + # Peak memory: input + result (float64 each) + parent + rank + + # region_map_flat (int32 each) = 2*8 + 3*4 = 28 bytes/pixel. + estimated_bytes = n_pixels * 28 + if estimated_bytes > 0.5 * avail: raise MemoryError( - f"sieve() needs the full array in memory " - f"(~{estimated_bytes * 5 / 1e9:.1f} GB) but only " - f"~{avail / 1e9:.1f} GB is available. Connected-component " - f"labeling is a global operation that cannot be chunked. " - f"Consider downsampling or tiling the input manually." + f"sieve() needs ~{estimated_bytes / 1e9:.1f} GB for the full " + f"array plus CCL bookkeeping, but only ~{avail / 1e9:.1f} GB " + f"is available. Connected-component labeling is a global " + f"operation that cannot be chunked. Consider downsampling " + f"or tiling the input manually." ) np_data = data.compute() @@ -338,18 +363,19 @@ def _sieve_dask(data, threshold, neighborhood, skip_values): def _sieve_dask_cupy(data, threshold, neighborhood, skip_values): """Dask+CuPy backend: compute to cupy, sieve via CPU fallback, wrap back.""" - estimated_bytes = np.prod(data.shape) * data.dtype.itemsize + n_pixels = np.prod(data.shape) + estimated_bytes = n_pixels * 28 try: import cupy as cp free_gpu, _total = cp.cuda.Device().mem_info - if estimated_bytes * 5 > 0.5 * free_gpu: + if estimated_bytes > 0.5 * free_gpu: raise MemoryError( - f"sieve() needs the full array on GPU " - f"(~{estimated_bytes * 5 / 1e9:.1f} GB) but only " - f"~{free_gpu / 1e9:.1f} GB free. Connected-component " - f"labeling is a global operation that cannot be chunked. " - f"Consider downsampling or tiling the input manually." + f"sieve() needs ~{estimated_bytes / 1e9:.1f} GB for the " + f"full array plus CCL bookkeeping, but only " + f"~{free_gpu / 1e9:.1f} GB free GPU memory. Connected-" + f"component labeling is a global operation that cannot be " + f"chunked. Consider downsampling or tiling the input." ) except (ImportError, AttributeError): pass