diff --git a/go.mod b/go.mod index cf602914109..4f3f2999d9e 100644 --- a/go.mod +++ b/go.mod @@ -100,7 +100,7 @@ require ( github.com/ipfs/go-log/v2 v2.6.0 // indirect github.com/jackpal/go-nat-pmp v1.0.2 // indirect github.com/jbenet/go-temp-err-catcher v0.1.0 // indirect - github.com/klauspost/cpuid/v2 v2.2.10 // indirect + github.com/klauspost/cpuid/v2 v2.3.0 github.com/koron/go-ssdp v0.0.6 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/libdns/libdns v0.2.2 // indirect diff --git a/go.sum b/go.sum index fab9e1c2f69..e90b1b1bd84 100644 --- a/go.sum +++ b/go.sum @@ -527,8 +527,8 @@ github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYW github.com/klauspost/cpuid v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= -github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= -github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= +github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= +github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg= github.com/klauspost/pgzip v1.0.2-0.20170402124221-0bf5dcad4ada/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/klauspost/reedsolomon v1.11.8 h1:s8RpUW5TK4hjr+djiOpbZJB4ksx+TdYbRH7vHQpwPOY= diff --git a/pkg/bmt/benchmark_test.go b/pkg/bmt/benchmark_test.go index aa6cc1703dd..0fe014bb0dc 100644 --- a/pkg/bmt/benchmark_test.go +++ b/pkg/bmt/benchmark_test.go @@ -29,6 +29,9 @@ func BenchmarkBMT(b *testing.B) { b.Run(fmt.Sprintf("%v_size_%v", "BMT", size), func(b *testing.B) { benchmarkBMT(b, size) }) + 
b.Run(fmt.Sprintf("%v_size_%v", "BMT_NoSIMD", size), func(b *testing.B) { + benchmarkBMTNoSIMD(b, size) + }) } } @@ -87,7 +90,7 @@ func benchmarkBMT(b *testing.B, n int) { testData := testutil.RandBytesWithSeed(b, 4096, seed) - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, testPoolSize)) + pool := bmt.NewPool(bmt.NewConf(testSegmentCount, testPoolSize)) h := pool.Get() defer pool.Put(h) @@ -106,7 +109,7 @@ func benchmarkPool(b *testing.B, poolsize int) { testData := testutil.RandBytesWithSeed(b, 4096, seed) - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, poolsize)) + pool := bmt.NewPool(bmt.NewConf(testSegmentCount, poolsize)) cycles := 100 b.ReportAllocs() @@ -127,6 +130,25 @@ func benchmarkPool(b *testing.B, poolsize int) { } } +// benchmarks BMT Hasher with SIMD disabled +func benchmarkBMTNoSIMD(b *testing.B, n int) { + b.Helper() + + testData := testutil.RandBytesWithSeed(b, 4096, seed) + + pool := bmt.NewPool(bmt.NewConfNoSIMD(testSegmentCount, testPoolSize)) + h := pool.Get() + defer pool.Put(h) + + b.ReportAllocs() + + for b.Loop() { + if _, err := syncHash(h, testData[:n]); err != nil { + b.Fatalf("seed %d: %v", seed, err) + } + } +} + // benchmarks the reference hasher func benchmarkRefHasher(b *testing.B, n int) { b.Helper() diff --git a/pkg/bmt/bmt.go b/pkg/bmt/bmt.go index ae3c5e95421..b21584a046c 100644 --- a/pkg/bmt/bmt.go +++ b/pkg/bmt/bmt.go @@ -18,40 +18,6 @@ var ( zerosection = make([]byte, 64) ) -// Hasher is a reusable hasher for fixed maximum size chunks representing a BMT -// It reuses a pool of trees for amortised memory allocation and resource control, -// and supports order-agnostic concurrent segment writes and section (double segment) writes -// as well as sequential read and write. -// -// The same hasher instance must not be called concurrently on more than one chunk. -// -// The same hasher instance is synchronously reusable. 
-// -// Sum gives back the tree to the pool and guaranteed to leave -// the tree and itself in a state reusable for hashing a new chunk. -type Hasher struct { - *Conf // configuration - bmt *tree // prebuilt BMT resource for flowcontrol and proofs - size int // bytes written to Hasher since last Reset() - pos int // index of rightmost currently open segment - result chan []byte // result channel - errc chan error // error channel - span []byte // The span of the data subsumed under the chunk -} - -// NewHasher gives back an instance of a Hasher struct -func NewHasher(hasherFact func() hash.Hash) *Hasher { - conf := NewConf(hasherFact, swarm.BmtBranches, 32) - - return &Hasher{ - Conf: conf, - result: make(chan []byte), - errc: make(chan error, 1), - span: make([]byte, SpanSize), - bmt: newTree(conf.maxSize, conf.depth, conf.hasher), - } -} - // Capacity returns the maximum amount of bytes that will be processed by this hasher implementation. // since BMT assumes a balanced binary tree, capacity it is always a power of 2 func (h *Hasher) Capacity() int { @@ -91,191 +57,12 @@ func (h *Hasher) BlockSize() int { return 2 * h.segmentSize } -// Hash returns the BMT root hash of the buffer and an error -// using Hash presupposes sequential synchronous writes (io.Writer interface). -func (h *Hasher) Hash(b []byte) ([]byte, error) { - if h.size == 0 { - return doHash(h.hasher(), h.span, h.zerohashes[h.depth]) - } - copy(h.bmt.buffer[h.size:], zerosection) - // write the last section with final flag set to true - go h.processSection(h.pos, true) - select { - case result := <-h.result: - return doHash(h.hasher(), h.span, result) - case err := <-h.errc: - return nil, err - } -} - // Sum returns the BMT root hash of the buffer, unsafe version of Hash func (h *Hasher) Sum(b []byte) []byte { s, _ := h.Hash(b) return s } -// Write calls sequentially add to the buffer to be hashed, -// with every full segment calls processSection in a go routine. 
-func (h *Hasher) Write(b []byte) (int, error) { - l := len(b) - maxVal := h.maxSize - h.size - if l > maxVal { - l = maxVal - } - copy(h.bmt.buffer[h.size:], b) - secsize := 2 * h.segmentSize - from := h.size / secsize - h.size += l - to := h.size / secsize - if l == maxVal { - to-- - } - h.pos = to - for i := from; i < to; i++ { - go h.processSection(i, false) - } - return l, nil -} - -// Reset prepares the Hasher for reuse -func (h *Hasher) Reset() { - h.pos = 0 - h.size = 0 - copy(h.span, zerospan) -} - -// processSection writes the hash of i-th section into level 1 node of the BMT tree. -func (h *Hasher) processSection(i int, final bool) { - secsize := 2 * h.segmentSize - offset := i * secsize - level := 1 - // select the leaf node for the section - n := h.bmt.leaves[i] - isLeft := n.isLeft - hasher := n.hasher - n = n.parent - // hash the section - section, err := doHash(hasher, h.bmt.buffer[offset:offset+secsize]) - if err != nil { - select { - case h.errc <- err: - default: - } - return - } - // write hash into parent node - if final { - // for the last segment use writeFinalNode - h.writeFinalNode(level, n, isLeft, section) - } else { - h.writeNode(n, isLeft, section) - } -} - -// writeNode pushes the data to the node. -// if it is the first of 2 sisters written, the routine terminates. -// if it is the second, it calculates the hash and writes it -// to the parent node recursively. -// since hashing the parent is synchronous the same hasher can be used. 
-func (h *Hasher) writeNode(n *node, isLeft bool, s []byte) { - var err error - for { - // at the root of the bmt just write the result to the result channel - if n == nil { - h.result <- s - return - } - // otherwise assign child hash to left or right segment - if isLeft { - n.left = s - } else { - n.right = s - } - // the child-thread first arriving will terminate - if n.toggle() { - return - } - // the thread coming second now can be sure both left and right children are written - // so it calculates the hash of left|right and pushes it to the parent - s, err = doHash(n.hasher, n.left, n.right) - if err != nil { - select { - case h.errc <- err: - default: - } - return - } - isLeft = n.isLeft - n = n.parent - } -} - -// writeFinalNode is following the path starting from the final datasegment to the -// BMT root via parents. -// For unbalanced trees it fills in the missing right sister nodes using -// the pool's lookup table for BMT subtree root hashes for all-zero sections. -// Otherwise behaves like `writeNode`. -func (h *Hasher) writeFinalNode(level int, n *node, isLeft bool, s []byte) { - var err error - for { - // at the root of the bmt just write the result to the result channel - if n == nil { - if s != nil { - h.result <- s - } - return - } - var noHash bool - if isLeft { - // coming from left sister branch - // when the final section's path is going via left child node - // we include an all-zero subtree hash for the right level and toggle the node. 
- n.right = h.zerohashes[level] - if s != nil { - n.left = s - // if a left final node carries a hash, it must be the first (and only thread) - // so the toggle is already in passive state no need no call - // yet thread needs to carry on pushing hash to parent - noHash = false - } else { - // if again first thread then propagate nil and calculate no hash - noHash = n.toggle() - } - } else { - // right sister branch - if s != nil { - // if hash was pushed from right child node, write right segment change state - n.right = s - // if toggle is true, we arrived first so no hashing just push nil to parent - noHash = n.toggle() - } else { - // if s is nil, then thread arrived first at previous node and here there will be two, - // so no need to do anything and keep s = nil for parent - noHash = true - } - } - // the child-thread first arriving will just continue resetting s to nil - // the second thread now can be sure both left and right children are written - // it calculates the hash of left|right and pushes it to the parent - if noHash { - s = nil - } else { - s, err = doHash(n.hasher, n.left, n.right) - if err != nil { - select { - case h.errc <- err: - default: - } - return - } - } - // iterate to parent - isLeft = n.isLeft - n = n.parent - level++ - } -} - // calculates the Keccak256 SHA3 hash of the data func sha3hash(data ...[]byte) ([]byte, error) { return doHash(swarm.NewHasher(), data...) diff --git a/pkg/bmt/bmt_simd.go b/pkg/bmt/bmt_simd.go new file mode 100644 index 00000000000..b19d2245dfd --- /dev/null +++ b/pkg/bmt/bmt_simd.go @@ -0,0 +1,181 @@ +// Copyright 2024 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && amd64 && !purego + +package bmt + +import ( + "github.com/ethersphere/bee/v2/pkg/keccak" +) + +// hashSIMD computes the BMT root hash using SIMD-accelerated Keccak hashing. 
+// It processes the tree level by level from leaves to root, using batched +// SIMD calls instead of goroutine-per-section. A single thread handles all +// levels since SIMD already provides intra-call parallelism (4-way or 8-way). +func (h *Hasher) hashSIMD() ([]byte, error) { + secsize := 2 * h.segmentSize + bw := h.batchWidth + prefixLen := len(h.prefix) + + // Leaf level: hash each section and write results to parent nodes. + // Single-threaded: SIMD batching (4 or 8 hashes per call) replaces goroutine parallelism. + h.hashLeavesBatch(0, len(h.bmt.levels[0]), bw, secsize, prefixLen) + + // Internal levels: process each level single-threaded (diminishing work). + for lvl := 1; lvl < len(h.bmt.levels)-1; lvl++ { + h.hashNodesBatch(h.bmt.levels[lvl], bw, prefixLen) + } + + // Root level: hash using scalar hasher. + root := h.bmt.levels[len(h.bmt.levels)-1][0] + return doHash(root.hasher, root.left, root.right) +} + +// hashLeavesBatch hashes leaf sections in the range [start, end) using SIMD batches. 
+func (h *Hasher) hashLeavesBatch(start, end, bw, secsize, prefixLen int) { + buf := h.bmt.buffer + + if bw == 8 { + var inputs [8][]byte + for i := start; i < end; i += 8 { + batch := 8 + if i+batch > end { + batch = end - i + } + for j := 0; j < batch; j++ { + offset := (i + j) * secsize + if prefixLen > 0 { + copy(h.bmt.leafConcat[j][prefixLen:], buf[offset:offset+secsize]) + inputs[j] = h.bmt.leafConcat[j][:prefixLen+secsize] + } else { + inputs[j] = buf[offset : offset+secsize] + } + } + for j := batch; j < 8; j++ { + inputs[j] = nil + } + var outputs [8]keccak.Hash256 + if h.useSIMD { + outputs = keccak.Sum256x8(inputs) + } else { + outputs = keccak.Sum256x8Scalar(inputs) + } + for j := 0; j < batch; j++ { + leaf := h.bmt.levels[0][i+j] + if leaf.isLeft { + copy(leaf.parent.left, outputs[j][:]) + } else { + copy(leaf.parent.right, outputs[j][:]) + } + } + } + } else { + var inputs [4][]byte + for i := start; i < end; i += 4 { + batch := 4 + if i+batch > end { + batch = end - i + } + for j := 0; j < batch; j++ { + offset := (i + j) * secsize + if prefixLen > 0 { + copy(h.bmt.leafConcat[j][prefixLen:], buf[offset:offset+secsize]) + inputs[j] = h.bmt.leafConcat[j][:prefixLen+secsize] + } else { + inputs[j] = buf[offset : offset+secsize] + } + } + for j := batch; j < 4; j++ { + inputs[j] = nil + } + var outputs [4]keccak.Hash256 + if h.useSIMD { + outputs = keccak.Sum256x4(inputs) + } else { + outputs = keccak.Sum256x4Scalar(inputs) + } + for j := 0; j < batch; j++ { + leaf := h.bmt.levels[0][i+j] + if leaf.isLeft { + copy(leaf.parent.left, outputs[j][:]) + } else { + copy(leaf.parent.right, outputs[j][:]) + } + } + } + } +} + +// hashNodesBatch hashes a level of internal nodes using SIMD batches. +// Each node's left||right (64 bytes) is hashed to produce the input for its parent. 
+func (h *Hasher) hashNodesBatch(nodes []*node, bw, prefixLen int) { + count := len(nodes) + segSize := h.segmentSize + concat := &h.bmt.concat + + if bw == 8 { + var inputs [8][]byte + for i := 0; i < count; i += 8 { + batch := 8 + if i+batch > count { + batch = count - i + } + for j := 0; j < batch; j++ { + n := nodes[i+j] + copy(concat[j][prefixLen:prefixLen+segSize], n.left) + copy(concat[j][prefixLen+segSize:], n.right) + inputs[j] = concat[j][:prefixLen+2*segSize] + } + for j := batch; j < 8; j++ { + inputs[j] = nil + } + var outputs [8]keccak.Hash256 + if h.useSIMD { + outputs = keccak.Sum256x8(inputs) + } else { + outputs = keccak.Sum256x8Scalar(inputs) + } + for j := 0; j < batch; j++ { + n := nodes[i+j] + if n.isLeft { + copy(n.parent.left, outputs[j][:]) + } else { + copy(n.parent.right, outputs[j][:]) + } + } + } + } else { + var inputs [4][]byte + for i := 0; i < count; i += 4 { + batch := 4 + if i+batch > count { + batch = count - i + } + for j := 0; j < batch; j++ { + n := nodes[i+j] + copy(concat[j][prefixLen:prefixLen+segSize], n.left) + copy(concat[j][prefixLen+segSize:], n.right) + inputs[j] = concat[j][:prefixLen+2*segSize] + } + for j := batch; j < 4; j++ { + inputs[j] = nil + } + var outputs [4]keccak.Hash256 + if h.useSIMD { + outputs = keccak.Sum256x4(inputs) + } else { + outputs = keccak.Sum256x4Scalar(inputs) + } + for j := 0; j < batch; j++ { + n := nodes[i+j] + if n.isLeft { + copy(n.parent.left, outputs[j][:]) + } else { + copy(n.parent.right, outputs[j][:]) + } + } + } + } +} diff --git a/pkg/bmt/bmt_test.go b/pkg/bmt/bmt_test.go index c82fb896a6f..e22538eb3f2 100644 --- a/pkg/bmt/bmt_test.go +++ b/pkg/bmt/bmt_test.go @@ -67,7 +67,7 @@ func TestHasherEmptyData(t *testing.T) { if err != nil { t.Fatal(err) } - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, count, 1)) + pool := bmt.NewPool(bmt.NewConf(count, 1)) h := pool.Get() resHash, err := syncHash(h, nil) if err != nil { @@ -92,7 +92,7 @@ func TestSyncHasherCorrectness(t 
*testing.T) { maxValue := count * hashSize var incr int capacity := 1 - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, count, capacity)) + pool := bmt.NewPool(bmt.NewConf(count, capacity)) for n := 0; n <= maxValue; n += incr { h := pool.Get() incr = 1 + rand.Intn(5) @@ -125,7 +125,7 @@ func TestHasherReuse(t *testing.T) { func testHasherReuse(t *testing.T, poolsize int) { t.Helper() - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, poolsize)) + pool := bmt.NewPool(bmt.NewConf(testSegmentCount, poolsize)) h := pool.Get() defer pool.Put(h) @@ -145,7 +145,7 @@ func TestBMTConcurrentUse(t *testing.T) { t.Parallel() testData := testutil.RandBytesWithSeed(t, 4096, seed) - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, testPoolSize)) + pool := bmt.NewPool(bmt.NewConf(testSegmentCount, testPoolSize)) cycles := 100 ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) @@ -178,7 +178,7 @@ func TestBMTWriterBuffers(t *testing.T) { t.Run(fmt.Sprintf("%d_segments", count), func(t *testing.T) { t.Parallel() - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, count, testPoolSize)) + pool := bmt.NewPool(bmt.NewConf(count, testPoolSize)) h := pool.Get() defer pool.Put(h) @@ -275,7 +275,7 @@ func testHasherCorrectness(h *bmt.Hasher, data []byte, n, count int) (err error) func TestUseSyncAsOrdinaryHasher(t *testing.T) { t.Parallel() - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, testPoolSize)) + pool := bmt.NewPool(bmt.NewConf(testSegmentCount, testPoolSize)) h := pool.Get() defer pool.Put(h) data := []byte("moodbytesmoodbytesmoodbytesmoodbytes") diff --git a/pkg/bmt/export_goroutine_test.go b/pkg/bmt/export_goroutine_test.go new file mode 100644 index 00000000000..26f48b88739 --- /dev/null +++ b/pkg/bmt/export_goroutine_test.go @@ -0,0 +1,13 @@ +// Copyright 2021 The Swarm Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux || !amd64 || purego + +package bmt + +// NewConfNoSIMD on the goroutine path just returns a regular Conf +// since there is no SIMD to disable. +func NewConfNoSIMD(segmentCount, capacity int) *Conf { + return NewConf(segmentCount, capacity) +} diff --git a/pkg/bmt/export_simd_test.go b/pkg/bmt/export_simd_test.go new file mode 100644 index 00000000000..f03e52d788f --- /dev/null +++ b/pkg/bmt/export_simd_test.go @@ -0,0 +1,16 @@ +// Copyright 2021 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && amd64 && !purego + +package bmt + +// NewConfNoSIMD creates a Conf identical to NewConf but with SIMD disabled, +// useful for benchmarking the non-SIMD path. +func NewConfNoSIMD(segmentCount, capacity int) *Conf { + c := NewConf(segmentCount, capacity) + c.useSIMD = false + c.batchWidth = 8 // use 8-wide batching with scalar fallback + return c +} diff --git a/pkg/bmt/hasher_goroutine.go b/pkg/bmt/hasher_goroutine.go new file mode 100644 index 00000000000..d18728dfee7 --- /dev/null +++ b/pkg/bmt/hasher_goroutine.go @@ -0,0 +1,234 @@ +// Copyright 2021 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux || !amd64 || purego + +package bmt + +import ( + "github.com/ethersphere/bee/v2/pkg/swarm" +) + +// Hasher is a reusable hasher for fixed maximum size chunks representing a BMT. +// This implementation uses goroutine-based concurrent tree traversal. +// +// It reuses a pool of trees for amortised memory allocation and resource control, +// and supports order-agnostic concurrent segment writes and section (double segment) writes +// as well as sequential read and write. 
+// +// The same hasher instance must not be called concurrently on more than one chunk. +// +// The same hasher instance is synchronously reusable. +// +// Sum gives back the tree to the pool and guaranteed to leave +// the tree and itself in a state reusable for hashing a new chunk. +type Hasher struct { + *Conf // configuration + bmt *tree // prebuilt BMT resource for flowcontrol and proofs + size int // bytes written to Hasher since last Reset() + pos int // index of rightmost currently open segment + result chan []byte // result channel + errc chan error // error channel + span []byte // The span of the data subsumed under the chunk +} + +// NewHasher gives back an instance of a Hasher struct +func NewHasher() *Hasher { + return newHasherWithConf(NewConf(swarm.BmtBranches, 32)) +} + +// NewPrefixHasher gives back an instance of a Hasher struct with the given prefix +// prepended to every hash operation. +func NewPrefixHasher(prefix []byte) *Hasher { + return newHasherWithConf(NewConfWithPrefix(prefix, swarm.BmtBranches, 32)) +} + +func newHasherWithConf(conf *Conf) *Hasher { + return &Hasher{ + Conf: conf, + result: make(chan []byte), + errc: make(chan error, 1), + span: make([]byte, SpanSize), + bmt: newTree(conf.maxSize, conf.depth, conf.hasherFunc), + } +} + +// Write calls sequentially add to the buffer to be hashed, +// with every full segment calls processSection in a go routine. +func (h *Hasher) Write(b []byte) (int, error) { + l := len(b) + maxVal := h.maxSize - h.size + if l > maxVal { + l = maxVal + } + copy(h.bmt.buffer[h.size:], b) + secsize := 2 * h.segmentSize + from := h.size / secsize + h.size += l + to := h.size / secsize + if l == maxVal { + to-- + } + h.pos = to + for i := from; i < to; i++ { + go h.processSection(i, false) + } + return l, nil +} + +// Hash returns the BMT root hash of the buffer and an error +// using Hash presupposes sequential synchronous writes (io.Writer interface). 
+func (h *Hasher) Hash(b []byte) ([]byte, error) { + if h.size == 0 { + return doHash(h.baseHasher(), h.span, h.zerohashes[h.depth]) + } + copy(h.bmt.buffer[h.size:], zerosection) + // write the last section with final flag set to true + go h.processSection(h.pos, true) + select { + case result := <-h.result: + return doHash(h.baseHasher(), h.span, result) + case err := <-h.errc: + return nil, err + } +} + +// Reset prepares the Hasher for reuse +func (h *Hasher) Reset() { + h.pos = 0 + h.size = 0 + copy(h.span, zerospan) +} + +// processSection writes the hash of i-th section into level 1 node of the BMT tree. +func (h *Hasher) processSection(i int, final bool) { + secsize := 2 * h.segmentSize + offset := i * secsize + level := 1 + // select the leaf node for the section + n := h.bmt.leaves[i] + isLeft := n.isLeft + hasher := n.hasher + n = n.parent + // hash the section + section, err := doHash(hasher, h.bmt.buffer[offset:offset+secsize]) + if err != nil { + select { + case h.errc <- err: + default: + } + return + } + // write hash into parent node + if final { + // for the last segment use writeFinalNode + h.writeFinalNode(level, n, isLeft, section) + } else { + h.writeNode(n, isLeft, section) + } +} + +// writeNode pushes the data to the node. +// if it is the first of 2 sisters written, the routine terminates. +// if it is the second, it calculates the hash and writes it +// to the parent node recursively. +// since hashing the parent is synchronous the same hasher can be used. 
+func (h *Hasher) writeNode(n *node, isLeft bool, s []byte) { + var err error + for { + // at the root of the bmt just write the result to the result channel + if n == nil { + h.result <- s + return + } + // otherwise assign child hash to left or right segment + if isLeft { + n.left = s + } else { + n.right = s + } + // the child-thread first arriving will terminate + if n.toggle() { + return + } + // the thread coming second now can be sure both left and right children are written + // so it calculates the hash of left|right and pushes it to the parent + s, err = doHash(n.hasher, n.left, n.right) + if err != nil { + select { + case h.errc <- err: + default: + } + return + } + isLeft = n.isLeft + n = n.parent + } +} + +// writeFinalNode is following the path starting from the final datasegment to the +// BMT root via parents. +// For unbalanced trees it fills in the missing right sister nodes using +// the pool's lookup table for BMT subtree root hashes for all-zero sections. +// Otherwise behaves like `writeNode`. +func (h *Hasher) writeFinalNode(level int, n *node, isLeft bool, s []byte) { + var err error + for { + // at the root of the bmt just write the result to the result channel + if n == nil { + if s != nil { + h.result <- s + } + return + } + var noHash bool + if isLeft { + // coming from left sister branch + // when the final section's path is going via left child node + // we include an all-zero subtree hash for the right level and toggle the node. 
+ n.right = h.zerohashes[level] + if s != nil { + n.left = s + // if a left final node carries a hash, it must be the first (and only thread) + // so the toggle is already in passive state no need no call + // yet thread needs to carry on pushing hash to parent + noHash = false + } else { + // if again first thread then propagate nil and calculate no hash + noHash = n.toggle() + } + } else { + // right sister branch + if s != nil { + // if hash was pushed from right child node, write right segment change state + n.right = s + // if toggle is true, we arrived first so no hashing just push nil to parent + noHash = n.toggle() + } else { + // if s is nil, then thread arrived first at previous node and here there will be two, + // so no need to do anything and keep s = nil for parent + noHash = true + } + } + // the child-thread first arriving will just continue resetting s to nil + // the second thread now can be sure both left and right children are written + // it calculates the hash of left|right and pushes it to the parent + if noHash { + s = nil + } else { + s, err = doHash(n.hasher, n.left, n.right) + if err != nil { + select { + case h.errc <- err: + default: + } + return + } + } + // iterate to parent + isLeft = n.isLeft + n = n.parent + level++ + } +} diff --git a/pkg/bmt/hasher_simd.go b/pkg/bmt/hasher_simd.go new file mode 100644 index 00000000000..ef3396482d7 --- /dev/null +++ b/pkg/bmt/hasher_simd.go @@ -0,0 +1,90 @@ +// Copyright 2021 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && amd64 && !purego + +package bmt + +import ( + "github.com/ethersphere/bee/v2/pkg/swarm" +) + +// Hasher is a reusable hasher for fixed maximum size chunks representing a BMT. +// This implementation buffers all data and defers hashing to Hash(), +// using SIMD-accelerated Keccak for multi-level trees. 
+// +// The same hasher instance must not be called concurrently on more than one chunk. +// +// The same hasher instance is synchronously reusable. +type Hasher struct { + *Conf // configuration + bmt *tree // prebuilt BMT resource for flowcontrol and proofs + size int // bytes written to Hasher since last Reset() + span []byte // The span of the data subsumed under the chunk +} + +// NewHasher gives back an instance of a Hasher struct +func NewHasher() *Hasher { + return newHasherWithConf(NewConf(swarm.BmtBranches, 32)) +} + +// NewPrefixHasher gives back an instance of a Hasher struct with the given prefix +// prepended to every hash operation. +func NewPrefixHasher(prefix []byte) *Hasher { + return newHasherWithConf(NewConfWithPrefix(prefix, swarm.BmtBranches, 32)) +} + +func newHasherWithConf(conf *Conf) *Hasher { + return &Hasher{ + Conf: conf, + span: make([]byte, SpanSize), + bmt: newTree(conf.maxSize, conf.depth, conf.baseHasher, conf.prefix), + } +} + +// Write calls sequentially add to the buffer to be hashed. +// All hashing is deferred to Hash(). +func (h *Hasher) Write(b []byte) (int, error) { + l := len(b) + maxVal := h.maxSize - h.size + if l > maxVal { + l = maxVal + } + copy(h.bmt.buffer[h.size:], b) + h.size += l + return l, nil +} + +// Hash returns the BMT root hash of the buffer and an error +// using Hash presupposes sequential synchronous writes (io.Writer interface). 
+func (h *Hasher) Hash(b []byte) ([]byte, error) { + if h.size == 0 { + return doHash(h.baseHasher(), h.span, h.zerohashes[h.depth]) + } + // zero-fill remainder so all sections have deterministic input + for i := h.size; i < h.maxSize; i++ { + h.bmt.buffer[i] = 0 + } + if len(h.bmt.levels) == 1 { + // single-level tree: hash the only section directly + secsize := 2 * h.segmentSize + root := h.bmt.levels[0][0] + rootHash, err := doHash(root.hasher, h.bmt.buffer[:secsize]) + if err != nil { + return nil, err + } + return doHash(h.baseHasher(), h.span, rootHash) + } + rootHash, err := h.hashSIMD() + if err != nil { + return nil, err + } + return doHash(h.baseHasher(), h.span, rootHash) +} + +// Reset prepares the Hasher for reuse +func (h *Hasher) Reset() { + h.size = 0 + copy(h.span, zerospan) +} diff --git a/pkg/bmt/pool.go b/pkg/bmt/pool_goroutine.go similarity index 71% rename from pkg/bmt/pool.go rename to pkg/bmt/pool_goroutine.go index a7f7245c40d..9a325ee57a9 100644 --- a/pkg/bmt/pool.go +++ b/pkg/bmt/pool_goroutine.go @@ -2,26 +2,37 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build !linux || !amd64 || purego + package bmt import ( "hash" "sync/atomic" + + "github.com/ethersphere/bee/v2/pkg/swarm" ) -// BaseHasherFunc is a hash.Hash constructor function used for the base hash of the BMT. 
-// implemented by Keccak256 SHA3 sha3.NewLegacyKeccak256 -type BaseHasherFunc func() hash.Hash +const SEGMENT_SIZE = 32 // configuration type Conf struct { - segmentSize int // size of leaf segments, stipulated to be = hash size - segmentCount int // the number of segments on the base level of the BMT - capacity int // pool capacity, controls concurrency - depth int // depth of the bmt trees = int(log2(segmentCount))+1 - maxSize int // the total length of the data (count * size) - zerohashes [][]byte // lookup table for predictable padding subtrees for all levels - hasher BaseHasherFunc // base hasher to use for the BMT levels + segmentSize int // size of leaf segments, stipulated to be = hash size + segmentCount int // the number of segments on the base level of the BMT + capacity int // pool capacity, controls concurrency + depth int // depth of the bmt trees = int(log2(segmentCount))+1 + maxSize int // the total length of the data (count * size) + zerohashes [][]byte // lookup table for predictable padding subtrees for all levels + prefix []byte // optional prefix prepended to every hash operation + hasherFunc func() hash.Hash +} + +// baseHasher returns a new base hasher instance, optionally with prefix. +func (c *Conf) baseHasher() hash.Hash { + if len(c.prefix) > 0 { + return swarm.NewPrefixHasher(c.prefix) + } + return swarm.NewHasher() } // Pool provides a pool of trees used as resources by the BMT Hasher. 
@@ -32,29 +43,49 @@ type Pool struct { *Conf // configuration } -func NewConf(hasher BaseHasherFunc, segmentCount, capacity int) *Conf { +func NewConf(segmentCount, capacity int) *Conf { + return newConf(nil, segmentCount, capacity) +} + +func NewConfWithPrefix(prefix []byte, segmentCount, capacity int) *Conf { + return newConf(prefix, segmentCount, capacity) +} + +func newConf(prefix []byte, segmentCount, capacity int) *Conf { count, depth := sizeToParams(segmentCount) - segmentSize := hasher().Size() + segmentSize := SEGMENT_SIZE + + hasherFunc := func() hash.Hash { + if len(prefix) > 0 { + return swarm.NewPrefixHasher(prefix) + } + return swarm.NewHasher() + } + + c := &Conf{ + segmentSize: segmentSize, + segmentCount: segmentCount, + capacity: capacity, + maxSize: count * segmentSize, + depth: depth, + prefix: prefix, + hasherFunc: hasherFunc, + } + zerohashes := make([][]byte, depth+1) zeros := make([]byte, segmentSize) zerohashes[0] = zeros var err error // initialises the zerohashes lookup table for i := 1; i < depth+1; i++ { - if zeros, err = doHash(hasher(), zeros, zeros); err != nil { + if zeros, err = doHash(c.baseHasher(), zeros, zeros); err != nil { panic(err.Error()) } zerohashes[i] = zeros } - return &Conf{ - hasher: hasher, - segmentSize: segmentSize, - segmentCount: segmentCount, - capacity: capacity, - maxSize: count * segmentSize, - depth: depth, - zerohashes: zerohashes, - } + c.zerohashes = zerohashes + + return c } // NewPool creates a tree pool with hasher, segment size, segment count and capacity @@ -65,7 +96,7 @@ func NewPool(c *Conf) *Pool { c: make(chan *tree, c.capacity), } for i := 0; i < c.capacity; i++ { - p.c <- newTree(p.maxSize, p.depth, p.hasher) + p.c <- newTree(p.maxSize, p.depth, c.hasherFunc) } return p } diff --git a/pkg/bmt/pool_simd.go b/pkg/bmt/pool_simd.go new file mode 100644 index 00000000000..78c7061c6ac --- /dev/null +++ b/pkg/bmt/pool_simd.go @@ -0,0 +1,209 @@ +// Copyright 2021 The Swarm Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && amd64 && !purego + +package bmt + +import ( + "hash" + + "github.com/ethersphere/bee/v2/pkg/keccak" + "github.com/ethersphere/bee/v2/pkg/swarm" +) + +const SEGMENT_SIZE = 32 + +// configuration +type Conf struct { + segmentSize int // size of leaf segments, stipulated to be = hash size + segmentCount int // the number of segments on the base level of the BMT + capacity int // pool capacity, controls concurrency + depth int // depth of the bmt trees = int(log2(segmentCount))+1 + maxSize int // the total length of the data (count * size) + zerohashes [][]byte // lookup table for predictable padding subtrees for all levels + prefix []byte // optional prefix prepended to every hash operation + useSIMD bool // whether SIMD keccak is available + batchWidth int // 4 (AVX2), 8 (AVX-512), or 0 +} + +// baseHasher returns a new base hasher instance, optionally with prefix. +func (c *Conf) baseHasher() hash.Hash { + if len(c.prefix) > 0 { + return swarm.NewPrefixHasher(c.prefix) + } + return swarm.NewHasher() +} + +// Pool provides a pool of trees used as resources by the BMT Hasher. +// A tree popped from the pool is guaranteed to have a clean state ready +// for hashing a new chunk. 
+type Pool struct {
+	c     chan *tree // the channel to obtain a resource from the pool
+	*Conf            // configuration
+}
+
+func NewConf(segmentCount, capacity int) *Conf {
+	return newConf(nil, segmentCount, capacity)
+}
+
+func NewConfWithPrefix(prefix []byte, segmentCount, capacity int) *Conf {
+	return newConf(prefix, segmentCount, capacity)
+}
+
+func newConf(prefix []byte, segmentCount, capacity int) *Conf {
+	count, depth := sizeToParams(segmentCount)
+	segmentSize := SEGMENT_SIZE
+
+	c := &Conf{
+		segmentSize:  segmentSize,
+		segmentCount: segmentCount,
+		capacity:     capacity,
+		maxSize:      count * segmentSize,
+		depth:        depth,
+		prefix:       prefix,
+		useSIMD:      keccak.HasSIMD(),
+	}
+
+	bw := keccak.BatchWidth()
+	if bw == 0 {
+		bw = 8 // no SIMD available: default to 8-wide batching backed by the scalar goroutine fallback
+	}
+	c.batchWidth = bw
+
+	zerohashes := make([][]byte, depth+1)
+	zeros := make([]byte, segmentSize)
+	zerohashes[0] = zeros
+	var err error
+	// initialises the zerohashes lookup table
+	for i := 1; i < depth+1; i++ {
+		if zeros, err = doHash(c.baseHasher(), zeros, zeros); err != nil {
+			panic(err.Error())
+		}
+		zerohashes[i] = zeros
+	}
+	c.zerohashes = zerohashes
+
+	return c
+}
+
+// NewPool creates a tree pool with hasher, segment size, segment count and capacity
+// it reuses free trees or creates a new one if capacity is not reached.
+func NewPool(c *Conf) *Pool { + p := &Pool{ + Conf: c, + c: make(chan *tree, c.capacity), + } + for i := 0; i < c.capacity; i++ { + p.c <- newTree(p.maxSize, p.depth, c.baseHasher, c.prefix) + } + return p +} + +// Get returns a BMT hasher possibly reusing a tree from the pool +func (p *Pool) Get() *Hasher { + t := <-p.c + return &Hasher{ + Conf: p.Conf, + span: make([]byte, SpanSize), + bmt: t, + } +} + +// Put is called after using a bmt hasher to return the tree to a pool for reuse +func (p *Pool) Put(h *Hasher) { + p.c <- h.bmt +} + +// tree is a reusable control structure representing a BMT +// organised in a binary tree +// +// Hasher uses a Pool to obtain a tree for each chunk hash +// the tree is 'locked' while not in the pool. +type tree struct { + leaves []*node // leaf nodes of the tree, other nodes accessible via parent links + levels [][]*node // levels[0]=leaves, levels[1]=parents of leaves, ..., levels[depth-1]=root + buffer []byte + concat [8][]byte // reusable concat buffers for SIMD node hashing + leafConcat [8][]byte // reusable concat buffers for SIMD leaf hashing +} + +// node is a reusable segment hasher representing a node in a BMT. +type node struct { + isLeft bool // whether it is left side of the parent double segment + parent *node // pointer to parent node in the BMT + left, right []byte // this is where the two children sections are written + hasher hash.Hash // preconstructed hasher on nodes +} + +// newNode constructs a segment hasher node in the BMT (used by newTree). 
+func newNode(index int, parent *node, hasher hash.Hash) *node { + return &node{ + parent: parent, + isLeft: index%2 == 0, + hasher: hasher, + left: make([]byte, hasher.Size()), + right: make([]byte, hasher.Size()), + } +} + +// newTree initialises a tree by building up the nodes of a BMT +func newTree(maxsize, depth int, hashfunc func() hash.Hash, prefix []byte) *tree { + prefixLen := len(prefix) + n := newNode(0, nil, hashfunc()) + prevlevel := []*node{n} + // collect levels top-down during construction, then reverse + allLevels := [][]*node{prevlevel} + // iterate over levels and creates 2^(depth-level) nodes + // the 0 level is on double segment sections so we start at depth - 2 + count := 2 + for level := depth - 2; level >= 0; level-- { + nodes := make([]*node, count) + for i := 0; i < count; i++ { + parent := prevlevel[i/2] + nodes[i] = newNode(i, parent, hashfunc()) + } + allLevels = append(allLevels, nodes) + prevlevel = nodes + count *= 2 + } + // reverse so levels[0]=leaves, levels[len-1]=root + for i, j := 0, len(allLevels)-1; i < j; i, j = i+1, j-1 { + allLevels[i], allLevels[j] = allLevels[j], allLevels[i] + } + // pre-allocate concat buffers for SIMD hashing (with space for optional prefix) + segSize := hashfunc().Size() + bufSize := prefixLen + 2*segSize + var concat [8][]byte + for i := range concat { + concat[i] = make([]byte, bufSize) + if prefixLen > 0 { + copy(concat[i][:prefixLen], prefix) + } + } + var leafConcat [8][]byte + for i := range leafConcat { + leafConcat[i] = make([]byte, prefixLen+2*segSize) + if prefixLen > 0 { + copy(leafConcat[i][:prefixLen], prefix) + } + } + // the datanode level is the nodes on the last level + return &tree{ + leaves: prevlevel, + levels: allLevels, + buffer: make([]byte, maxsize), + concat: concat, + leafConcat: leafConcat, + } +} + +// sizeToParams calculates the depth (number of levels) and segment count in the BMT tree. 
+func sizeToParams(n int) (c, d int) { + c = 2 + for ; c < n; c *= 2 { + d++ + } + return c, d + 1 +} diff --git a/pkg/bmt/proof.go b/pkg/bmt/proof.go index b08017b43cd..4b1eda7351f 100644 --- a/pkg/bmt/proof.go +++ b/pkg/bmt/proof.go @@ -17,7 +17,8 @@ type Proof struct { Index int } -// Hash overrides base hash function of Hasher to fill buffer with zeros until chunk length +// Hash overrides base hash function of Hasher to fill buffer with zeros until chunk length. +// It always pads with zero sections so that the proof tree is fully populated. func (p Prover) Hash(b []byte) ([]byte, error) { for i := p.size; i < p.maxSize; i += len(zerosection) { _, err := p.Write(zerosection) @@ -67,7 +68,7 @@ func (p Prover) Verify(i int, proof Proof) (root []byte, err error) { } i = i / 2 n := p.bmt.leaves[i] - hasher := p.hasher() + hasher := p.baseHasher() isLeft := n.isLeft root, err = doHash(hasher, section) if err != nil { diff --git a/pkg/bmt/proof_test.go b/pkg/bmt/proof_test.go index 5b3625acefb..edf91c192a7 100644 --- a/pkg/bmt/proof_test.go +++ b/pkg/bmt/proof_test.go @@ -46,7 +46,7 @@ func TestProofCorrectness(t *testing.T) { } } - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, 128, 128)) + pool := bmt.NewPool(bmt.NewConf(128, 128)) hh := pool.Get() t.Cleanup(func() { pool.Put(hh) @@ -191,7 +191,7 @@ func TestProof(t *testing.T) { t.Fatal(err) } - pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, 128, 128)) + pool := bmt.NewPool(bmt.NewConf(128, 128)) hh := pool.Get() t.Cleanup(func() { pool.Put(hh) diff --git a/pkg/bmtpool/bmtpool.go b/pkg/bmtpool/bmtpool.go index 88c1ad32dba..06bdcaa04b4 100644 --- a/pkg/bmtpool/bmtpool.go +++ b/pkg/bmtpool/bmtpool.go @@ -17,7 +17,7 @@ var instance *bmt.Pool // nolint:gochecknoinits func init() { - instance = bmt.NewPool(bmt.NewConf(swarm.NewHasher, swarm.BmtBranches, Capacity)) + instance = bmt.NewPool(bmt.NewConf(swarm.BmtBranches, Capacity)) } // Get a bmt Hasher instance. 
diff --git a/pkg/keccak/keccak.go b/pkg/keccak/keccak.go
new file mode 100644
index 00000000000..24b3ba58727
--- /dev/null
+++ b/pkg/keccak/keccak.go
@@ -0,0 +1,149 @@
+// Copyright 2024 The Swarm Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package keccak provides legacy Keccak-256 (Ethereum-compatible) hashing
+// with SIMD acceleration via XKCP.
+//
+// On amd64, the package automatically selects between AVX-512 (8-way parallel)
+// and AVX2 (4-way parallel) based on the CPU's capabilities.
+package keccak
+
+import (
+	"encoding/hex"
+	"sync"
+
+	"golang.org/x/crypto/sha3"
+)
+
+// Hash256 represents a 32-byte Keccak-256 hash
+type Hash256 [32]byte
+
+// HexString returns the hash as a hexadecimal string
+func (h Hash256) HexString() string {
+	return hex.EncodeToString(h[:])
+}
+
+// HasAVX512 reports whether the CPU supports AVX-512 (F + VL) and the
+// AVX-512 code path is available.
+func HasAVX512() bool {
+	return hasAVX512
+}
+
+// HasSIMD reports whether any SIMD-accelerated Keccak path is available
+// (AVX2 or AVX-512).
+func HasSIMD() bool {
+	return hasAVX2
+}
+
+// BatchWidth returns the SIMD batch width: 8 for AVX-512, 4 for AVX2, or 0
+// if no SIMD acceleration is available.
+func BatchWidth() int {
+	if hasAVX512 {
+		return 8
+	}
+	if hasAVX2 {
+		return 4
+	}
+	return 0
+}
+
+// Sum256 computes a single Keccak-256 hash (legacy, Ethereum-compatible).
+// It always uses the scalar implementation; SIMD acceleration only applies
+// to the batched Sum256x4/Sum256x8 variants.
+func Sum256(data []byte) Hash256 {
+	return sum256Scalar(data)
+}
+
+// Sum256x4 computes 4 Keccak-256 hashes in parallel using AVX2.
+// Falls back to scalar if AVX2 is not available.
+func Sum256x4(inputs [4][]byte) [4]Hash256 { + if !hasAVX2 { + return sum256x4Scalar(inputs) + } + var outputs [4]Hash256 + var inputsCopy [4][]byte + copy(inputsCopy[:], inputs[:]) + keccak256x4(&inputsCopy, &outputs) + return outputs +} + +// Sum256x4Scalar computes 4 Keccak-256 hashes using the scalar path, +// bypassing SIMD detection. Used by callers that explicitly want non-SIMD. +func Sum256x4Scalar(inputs [4][]byte) [4]Hash256 { + return sum256x4Scalar(inputs) +} + +// Sum256x8 computes 8 Keccak-256 hashes in parallel using AVX-512. +// Falls back to scalar if AVX-512 is not available. +func Sum256x8(inputs [8][]byte) [8]Hash256 { + if !hasAVX512 { + return sum256x8Scalar(inputs) + } + var outputs [8]Hash256 + var inputsCopy [8][]byte + copy(inputsCopy[:], inputs[:]) + keccak256x8(&inputsCopy, &outputs) + return outputs +} + +// Sum256x8Scalar computes 8 Keccak-256 hashes using the scalar path, +// bypassing SIMD detection. Used by callers that explicitly want non-SIMD. +func Sum256x8Scalar(inputs [8][]byte) [8]Hash256 { + return sum256x8Scalar(inputs) +} + +func sum256Scalar(data []byte) Hash256 { + var out Hash256 + h := sha3.NewLegacyKeccak256() + h.Write(data) + copy(out[:], h.Sum(nil)) + return out +} + +func sum256x4Scalar(inputs [4][]byte) [4]Hash256 { + var outputs [4]Hash256 + var wg sync.WaitGroup + var mu sync.Mutex + for i := 0; i < 4; i++ { + if inputs[i] == nil { + continue + } + wg.Add(1) + + go func() { + defer wg.Done() + h := sha3.NewLegacyKeccak256() + h.Write(inputs[i]) + result := h.Sum(nil) + mu.Lock() + copy(outputs[i][:], result) + mu.Unlock() + }() + } + wg.Wait() + return outputs +} + +func sum256x8Scalar(inputs [8][]byte) [8]Hash256 { + var outputs [8]Hash256 + var wg sync.WaitGroup + var mu sync.Mutex + + for i := 0; i < 8; i++ { + if inputs[i] == nil { + continue + } + wg.Add(1) + go func() { + defer wg.Done() + h := sha3.NewLegacyKeccak256() + h.Write(inputs[i]) + result := h.Sum(nil) + mu.Lock() + copy(outputs[i][:], result) 
+ mu.Unlock() + }() + } + wg.Wait() + return outputs +} diff --git a/pkg/keccak/keccak_amd64.go b/pkg/keccak/keccak_amd64.go new file mode 100644 index 00000000000..2894b54080c --- /dev/null +++ b/pkg/keccak/keccak_amd64.go @@ -0,0 +1,9 @@ +//go:build linux && amd64 && !purego + +package keccak + +//go:noescape +func keccak256x4(inputs *[4][]byte, outputs *[4]Hash256) + +//go:noescape +func keccak256x8(inputs *[8][]byte, outputs *[8]Hash256) diff --git a/pkg/keccak/keccak_cpu.go b/pkg/keccak/keccak_cpu.go new file mode 100644 index 00000000000..7ee62db10e8 --- /dev/null +++ b/pkg/keccak/keccak_cpu.go @@ -0,0 +1,14 @@ +// Copyright 2024 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build linux && amd64 && !purego + +package keccak + +import cpuid "github.com/klauspost/cpuid/v2" + +var ( + hasAVX2 = cpuid.CPU.Supports(cpuid.AVX2) + hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512VL) +) diff --git a/pkg/keccak/keccak_cpu_other.go b/pkg/keccak/keccak_cpu_other.go new file mode 100644 index 00000000000..0e0966aad73 --- /dev/null +++ b/pkg/keccak/keccak_cpu_other.go @@ -0,0 +1,14 @@ +// Copyright 2024 The Swarm Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux || !amd64 || purego + +package keccak + +// No SIMD Keccak implementations are available on this platform; +// Sum256x4/Sum256x8 will fall back to the scalar goroutine path. 
+var ( + hasAVX2 = false + hasAVX512 = false +) diff --git a/pkg/keccak/keccak_other.go b/pkg/keccak/keccak_other.go new file mode 100644 index 00000000000..37e5b29caac --- /dev/null +++ b/pkg/keccak/keccak_other.go @@ -0,0 +1,11 @@ +//go:build !linux || !amd64 || purego + +package keccak + +func keccak256x4(_ *[4][]byte, _ *[4]Hash256) { + panic("keccak: SIMD not available on this platform") +} + +func keccak256x8(_ *[8][]byte, _ *[8]Hash256) { + panic("keccak: SIMD not available on this platform") +} diff --git a/pkg/keccak/keccak_times4_amd64.s b/pkg/keccak/keccak_times4_amd64.s new file mode 100644 index 00000000000..6d6f385f45f --- /dev/null +++ b/pkg/keccak/keccak_times4_amd64.s @@ -0,0 +1,11 @@ +//go:build linux && amd64 && !purego + +#include "textflag.h" + +// func keccak256x4(inputs *[4][]byte, outputs *[4]Hash256) +TEXT ·keccak256x4(SB), $16384-16 + MOVQ inputs+0(FP), DI + MOVQ outputs+8(FP), SI + CALL go_keccak256x4(SB) + VZEROUPPER + RET diff --git a/pkg/keccak/keccak_times4_linux_amd64.syso b/pkg/keccak/keccak_times4_linux_amd64.syso new file mode 100644 index 00000000000..06c7bc1db5f Binary files /dev/null and b/pkg/keccak/keccak_times4_linux_amd64.syso differ diff --git a/pkg/keccak/keccak_times8_amd64.s b/pkg/keccak/keccak_times8_amd64.s new file mode 100644 index 00000000000..8108924205a --- /dev/null +++ b/pkg/keccak/keccak_times8_amd64.s @@ -0,0 +1,13 @@ +//go:build linux && amd64 && !purego + +#include "textflag.h" + +// func keccak256x8(inputs *[8][]byte, outputs *[8]Hash256) +// Frame size 16384: AVX-512 state is larger (25 x 64 bytes = 1600 bytes) and +// the permutation uses more stack. Generous headroom provided. 
+TEXT ·keccak256x8(SB), $16384-16 + MOVQ inputs+0(FP), DI + MOVQ outputs+8(FP), SI + CALL go_keccak256x8(SB) + VZEROUPPER + RET diff --git a/pkg/keccak/keccak_times8_linux_amd64.syso b/pkg/keccak/keccak_times8_linux_amd64.syso new file mode 100644 index 00000000000..5d9f9fa77c2 Binary files /dev/null and b/pkg/keccak/keccak_times8_linux_amd64.syso differ diff --git a/pkg/storageincentives/proof.go b/pkg/storageincentives/proof.go index 86dd115c67e..b26f5dc54a5 100644 --- a/pkg/storageincentives/proof.go +++ b/pkg/storageincentives/proof.go @@ -7,7 +7,6 @@ package storageincentives import ( "errors" "fmt" - "hash" "math/big" "github.com/ethersphere/bee/v2/pkg/bmt" @@ -55,10 +54,7 @@ func makeInclusionProofs( require2++ } - prefixHasherFactory := func() hash.Hash { - return swarm.NewPrefixHasher(anchor1) - } - prefixHasherPool := bmt.NewPool(bmt.NewConf(prefixHasherFactory, swarm.BmtBranches, 8)) + prefixHasherPool := bmt.NewPool(bmt.NewConfWithPrefix(anchor1, swarm.BmtBranches, 8)) // Sample chunk proofs rccontent := bmt.Prover{Hasher: bmtpool.Get()} diff --git a/pkg/storageincentives/soc_mine_test.go b/pkg/storageincentives/soc_mine_test.go index 0265a9a21f7..b3d71077165 100644 --- a/pkg/storageincentives/soc_mine_test.go +++ b/pkg/storageincentives/soc_mine_test.go @@ -9,12 +9,10 @@ import ( "encoding/binary" "encoding/hex" "fmt" - "hash" "math/big" "os" "sync" "testing" - "testing/synctest" "github.com/ethersphere/bee/v2/pkg/bmt" "github.com/ethersphere/bee/v2/pkg/cac" @@ -33,59 +31,55 @@ import ( // to generate uploads using the input // cat socs.txt | tail 19 | head 16 | perl -pne 's/([a-f0-9]+)\t([a-f0-9]+)\t([a-f0-9]+)\t([a-f0-9]+)/echo -n $4 | xxd -r -p | curl -X POST \"http:\/\/localhost:1633\/soc\/$1\/$2?sig=$3\" -H \"accept: application\/json, text\/plain, \/\" -H \"content-type: application\/octet-stream\" -H \"swarm-postage-batch-id: 14b26beca257e763609143c6b04c2c487f01a051798c535c2f542ce75a97c05f\" --data-binary \@-/' func TestSocMine(t *testing.T) { - 
synctest.Test(t, func(t *testing.T) { - // the anchor used in neighbourhood selection and reserve salt for sampling - prefix, err := hex.DecodeString("3617319a054d772f909f7c479a2cebe5066e836a939412e32403c99029b92eff") - if err != nil { - t.Fatal(err) - } - // the transformed address hasher factory function - prefixhasher := func() hash.Hash { return swarm.NewPrefixHasher(prefix) } - // Create a pool for efficient hasher reuse - trHasherPool := bmt.NewPool(bmt.NewConf(prefixhasher, swarm.BmtBranches, 8)) - // the bignum cast of the maximum sample value (upper bound on transformed addresses as a 256-bit article) - // this constant is for a minimum reserve size of 2 million chunks with sample size of 16 - // = 1.284401 * 10^71 = 1284401 + 66 0-s - mstring := "1284401" - for range 66 { - mstring = mstring + "0" - } - n, ok := new(big.Int).SetString(mstring, 10) - if !ok { - t.Fatalf("SetString: error setting to '%s'", mstring) - } - // the filter function on the SOC address - // meant to make sure we pass check for proof of retrievability for - // a node of overlay 0x65xxx with a reserve depth of 1, i.e., - // SOC address must start with zero bit - filterSOCAddr := func(a swarm.Address) bool { - return a.Bytes()[0]&0x80 != 0x00 - } - // the filter function on the transformed address using the density estimation constant - filterTrAddr := func(a swarm.Address) (bool, error) { - m := new(big.Int).SetBytes(a.Bytes()) - return m.Cmp(n) < 0, nil - } - // setup the signer with a private key from a fixture - data, err := hex.DecodeString("634fb5a872396d9693e5c9f9d7233cfa93f395c093371017ff44aa9ae6564cdd") - if err != nil { - t.Fatal(err) - } - privKey, err := crypto.DecodeSecp256k1PrivateKey(data) - if err != nil { - t.Fatal(err) - } - signer := crypto.NewDefaultSigner(privKey) + // the anchor used in neighbourhood selection and reserve salt for sampling + prefix, err := hex.DecodeString("3617319a054d772f909f7c479a2cebe5066e836a939412e32403c99029b92eff") + if err != nil { + 
t.Fatal(err) + } + // Create a pool for efficient hasher reuse + trHasherPool := bmt.NewPool(bmt.NewConfWithPrefix(prefix, swarm.BmtBranches, 8)) + // the bignum cast of the maximum sample value (upper bound on transformed addresses as a 256-bit article) + // this constant is for a minimum reserve size of 2 million chunks with sample size of 16 + // = 1.284401 * 10^71 = 1284401 + 66 0-s + mstring := "1284401" + for range 66 { + mstring = mstring + "0" + } + n, ok := new(big.Int).SetString(mstring, 10) + if !ok { + t.Fatalf("SetString: error setting to '%s'", mstring) + } + // the filter function on the SOC address + // meant to make sure we pass check for proof of retrievability for + // a node of overlay 0x65xxx with a reserve depth of 1, i.e., + // SOC address must start with zero bit + filterSOCAddr := func(a swarm.Address) bool { + return a.Bytes()[0]&0x80 != 0x00 + } + // the filter function on the transformed address using the density estimation constant + filterTrAddr := func(a swarm.Address) (bool, error) { + m := new(big.Int).SetBytes(a.Bytes()) + return m.Cmp(n) < 0, nil + } + // setup the signer with a private key from a fixture + data, err := hex.DecodeString("634fb5a872396d9693e5c9f9d7233cfa93f395c093371017ff44aa9ae6564cdd") + if err != nil { + t.Fatal(err) + } + privKey, err := crypto.DecodeSecp256k1PrivateKey(data) + if err != nil { + t.Fatal(err) + } + signer := crypto.NewDefaultSigner(privKey) - sampleSize := 16 - // for sanity check: given a filterSOCAddr requiring a 0 leading bit (chance of 1/2) - // we expect an overall rough 4 million chunks to be mined to create this sample - // for 8 workers that is half a million round on average per worker - err = makeChunks(t, signer, sampleSize, filterSOCAddr, filterTrAddr, trHasherPool) - if err != nil { - t.Fatal(err) - } - }) + sampleSize := 16 + // for sanity check: given a filterSOCAddr requiring a 0 leading bit (chance of 1/2) + // we expect an overall rough 4 million chunks to be mined to create 
this sample + // for 8 workers that is half a million round on average per worker + err = makeChunks(t, signer, sampleSize, filterSOCAddr, filterTrAddr, trHasherPool) + if err != nil { + t.Fatal(err) + } } func makeChunks(t *testing.T, signer crypto.Signer, sampleSize int, filterSOCAddr func(swarm.Address) bool, filterTrAddr func(swarm.Address) (bool, error), trHasherPool *bmt.Pool) error { diff --git a/pkg/storer/sample.go b/pkg/storer/sample.go index 43999429300..ae4b88c66a6 100644 --- a/pkg/storer/sample.go +++ b/pkg/storer/sample.go @@ -9,7 +9,6 @@ import ( "context" "encoding/binary" "fmt" - "hash" "math/big" "runtime" "sort" @@ -121,16 +120,12 @@ func (db *DB) ReserveSample( // Phase 2: Get the chunk data and calculate transformed hash sampleItemChan := make(chan SampleItem, 3*workers) - prefixHasherFactory := func() hash.Hash { - return swarm.NewPrefixHasher(anchor) - } - db.logger.Debug("reserve sampler workers", "count", workers) for range workers { g.Go(func() error { wstat := SampleStats{} - hasher := bmt.NewHasher(prefixHasherFactory) + hasher := bmt.NewPrefixHasher(anchor) defer func() { addStats(wstat) }() @@ -407,12 +402,9 @@ func RandSample(t *testing.T, anchor []byte) Sample { // MakeSampleUsingChunks returns Sample constructed using supplied chunks. func MakeSampleUsingChunks(chunks []swarm.Chunk, anchor []byte) (Sample, error) { - prefixHasherFactory := func() hash.Hash { - return swarm.NewPrefixHasher(anchor) - } items := make([]SampleItem, len(chunks)) for i, ch := range chunks { - tr, err := transformedAddress(bmt.NewHasher(prefixHasherFactory), ch, getChunkType(ch)) + tr, err := transformedAddress(bmt.NewPrefixHasher(anchor), ch, getChunkType(ch)) if err != nil { return Sample{}, err }