diff --git a/go.mod b/go.mod
index f51b31e..dd619b3 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,7 @@ module github.com/kelindar/tile
 go 1.23
 
 require (
+	github.com/kelindar/intmap v1.4.1
 	github.com/kelindar/iostream v1.4.0
 	github.com/stretchr/testify v1.9.0
 )
diff --git a/go.sum b/go.sum
index 9a32195..a913624 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,7 @@
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/kelindar/intmap v1.4.1 h1:3jTPTrfNx4pxBPURR1+6f4YhbZS57CzsU0S9NEV51ZI=
+github.com/kelindar/intmap v1.4.1/go.mod h1:NkypxhfaklmDTJqwano3Q1BWk6je77qgQwszDwu8Kc8=
 github.com/kelindar/iostream v1.4.0 h1:ELKlinnM/K3GbRp9pYhWuZOyBxMMlYAfsOP+gauvZaY=
 github.com/kelindar/iostream v1.4.0/go.mod h1:MkjMuVb6zGdPQVdwLnFRO0xOTOdDvBWTztFmjRDQkXk=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
diff --git a/path.go b/path.go
index baf61ba..31b8f31 100644
--- a/path.go
+++ b/path.go
@@ -5,7 +5,10 @@ package tile
 
 import (
 	"math"
+	"math/bits"
 	"sync"
+
+	"github.com/kelindar/intmap"
 )
 
 type costFn = func(Value) uint16
@@ -25,19 +28,20 @@ func (m *Grid[T]) Around(from Point, distance uint32, costOf costFn, fn func(Poi
 
 	fn(from, start)
 
-	// Acquire a frontier heap for search
-	frontier := acquireHeap()
-	frontier.Push(from.Integer(), 0)
-	defer releaseHeap(frontier)
-
 	// For pre-allocating, we use πr2 since BFS will result in a approximation
 	// of a circle, in the worst case.
 	maxArea := int(math.Ceil(math.Pi * float64(distance*distance)))
-	reached := make(map[uint32]struct{}, maxArea)
-	reached[from.Integer()] = struct{}{}
 
+	// Acquire pooled search state: the frontier queue and the reached map
+	state := acquire(maxArea)
+	frontier := state.frontier
+	reached := state.edges
+	defer release(state)
+
+	frontier.Push(from.Integer(), 0)
+	reached.Store(from.Integer(), 0)
 	for !frontier.IsEmpty() {
-		pCurr, _ := frontier.Pop()
+		pCurr := frontier.Pop()
 		current := unpackPoint(pCurr)
 
 		// Get all of the neighbors
@@ -52,9 +56,9 @@ func (m *Grid[T]) Around(from Point, distance uint32, costOf costFn, fn func(Poi
 
 			// Add to the search queue
 			pNext := next.Integer()
-			if _, ok := reached[pNext]; !ok {
+			if _, ok := reached.Load(pNext); !ok {
 				frontier.Push(pNext, 1)
-				reached[pNext] = struct{}{}
+				reached.Store(pNext, 1)
 				fn(next, nextTile)
 			}
 		})
@@ -63,177 +67,190 @@ func (m *Grid[T]) Around(from Point, distance uint32, costOf costFn, fn func(Poi
 
 // Path calculates a short path and the distance between the two locations
 func (m *Grid[T]) Path(from, to Point, costOf costFn) ([]Point, int, bool) {
-
-	// Acquire a frontier heap for search
-	frontier := acquireHeap()
-	frontier.Push(from.Integer(), 0)
-	defer releaseHeap(frontier)
+	distance := float64(from.DistanceTo(to))
+	maxArea := int(math.Ceil(math.Pi * float64(distance*distance)))
 
 	// For pre-allocating, we use πr2 since BFS will result in a approximation
 	// of a circle, in the worst case.
-	distance := float64(from.DistanceTo(to))
-	maxArea := int(math.Ceil(math.Pi * float64(distance*distance)))
-	edges := make(map[uint32]edge, maxArea)
-	edges[from.Integer()] = edge{
-		Point: from,
-		Cost:  0,
-	}
+	state := acquire(maxArea)
+	edges := state.edges
+	frontier := state.frontier
+	defer release(state)
+
+	frontier.Push(from.Integer(), 0)
+	edges.Store(from.Integer(), encode(0, Direction(0))) // Start has no incoming edge; its direction is never read during reconstruction
 
 	for !frontier.IsEmpty() {
-		pCurr, _ := frontier.Pop()
+		pCurr := frontier.Pop()
 		current := unpackPoint(pCurr)
 
-		// We have a path to the goal
+		// Look up the cost currently recorded for reaching this point
+		currentEncoded, _ := edges.Load(pCurr)
+		currentCost, _ := decode(currentEncoded)
+
+		// Destination popped: rebuild the route and return
 		if current.Equal(to) {
-			dist := int(edges[current.Integer()].Cost)
-			path := make([]Point, 0, dist)
-			curr, _ := edges[current.Integer()]
-			for !curr.Point.Equal(from) {
-				path = append(path, curr.Point)
-				curr = edges[curr.Point.Integer()]
+
+			// Walk the stored incoming directions backwards from the destination to the start
+			path := make([]Point, 0, 64)
+			path = append(path, current)
+			for !current.Equal(from) {
+				currentEncoded, _ := edges.Load(current.Integer())
+				_, dir := decode(currentEncoded)
+				current = current.Move(oppositeDirection(dir))
+				path = append(path, current)
+			}
+
+			// Reverse in place so the path runs from the start to the destination (both inclusive)
+			for i, j := 0, len(path)-1; i < j; i, j = i+1, j-1 {
+				path[i], path[j] = path[j], path[i]
 			}
 
-			return path, dist, true
+			return path, int(currentCost), true
 		}
 
-		// Get all of the neighbors
+		// Relax every walkable neighbor of the current point
 		m.Neighbors(current.X, current.Y, func(next Point, nextTile Tile[T]) {
 			cNext := costOf(nextTile.Value())
 			if cNext == 0 {
-				return // Blocked tile, ignore completely
+				return // A zero cost marks a blocked tile; skip it
 			}
 
+			nextCost := currentCost + uint32(cNext)
 			pNext := next.Integer()
-			newCost := edges[pCurr].Cost + uint32(cNext) // cost(current, next)
 
-			if e, ok := edges[pNext]; !ok || newCost < e.Cost {
-				priority := newCost + next.DistanceTo(to) // heuristic
-				frontier.Push(next.Integer(), priority)
+			existingEncoded, visited := edges.Load(pNext)
+			existingCost, _ := decode(existingEncoded)
 
-				edges[pNext] = edge{
-					Point: current,
-					Cost:  newCost,
-				}
-			}
+			// First visit, or a strictly cheaper route than the one recorded
+			if !visited || nextCost < existingCost {
+				angle := angleOf(current, next)
+				priority := nextCost + next.DistanceTo(to)
 
+				// Record cost and incoming direction, then queue the node by cost + distance to goal
+				edges.Store(pNext, encode(nextCost, angle))
+				frontier.Push(pNext, priority)
+			}
 		})
 	}
 
 	return nil, 0, false
 }
 
-// -----------------------------------------------------------------------------
-
-var heapPool = sync.Pool{
-	New: func() interface{} { return new(heap32) },
-}
-
-// Acquires a new instance of a heap
-func acquireHeap() *heap32 {
-	h := heapPool.Get().(*heap32)
-	h.Reset()
-	return h
+// encode packs cost into the upper 28 bits and dir into the low 4 bits of a uint32; cost bits above 28 are shifted out
+func encode(cost uint32, dir Direction) uint32 {
+	return (cost << 4) | uint32(dir&0xF)
 }
 
-// Releases a heap instance back to the pool
-func releaseHeap(h *heap32) {
-	heapPool.Put(h)
+// decode splits a value produced by encode into its cost (upper 28 bits) and direction (low 4 bits)
+func decode(value uint32) (cost uint32, dir Direction) {
+	cost = value >> 4
+	dir = Direction(value & 0xF)
+	return
 }
 
 // -----------------------------------------------------------------------------
 
-// heapNode represents a ranked node for the heap.
-type heapNode struct {
-	Value uint32 // The value of the ranked node.
-	Rank  uint32 // The rank associated with the ranked node.
+type pathfinder struct {
+	edges    *intmap.Map // per-search map keyed by packed point (Point.Integer())
+	frontier *frontier   // bucketed queue of packed points awaiting expansion
 }
 
-type heap32 []heapNode
-
-func newHeap32(capacity int) heap32 {
-	return make(heap32, 0, capacity)
+var pathfinders = sync.Pool{ // reusable search state, handed out by acquire and returned by release
+	New: func() any {
+		return &pathfinder{
+			edges:    intmap.New(32, .95), // NOTE(review): presumably initial size and fill factor — confirm against intmap
+			frontier: newFrontier(),
+		}
+	},
 }
 
-// Reset clears the heap for reuse
-func (h *heap32) Reset() {
-	*h = (*h)[:0]
-}
+// acquire takes a pathfinder from the pool, swapping in a larger map when the pooled one is smaller than capacity
+func acquire(capacity int) *pathfinder {
+	v := pathfinders.Get().(*pathfinder)
+	if v.edges.Capacity() < capacity {
+		v.edges = intmap.New(capacity, .95)
+	}
 
-// Push pushes the element x onto the heap.
-// The complexity is O(log n) where n = h.Len().
-func (h *heap32) Push(v, rank uint32) {
-	*h = append(*h, heapNode{
-		Value: v,
-		Rank:  rank,
-	})
-	h.up(h.Len() - 1)
+	return v
 }
 
-// Pop removes and returns the minimum element (according to Less) from the heap.
-// The complexity is O(log n) where n = h.Len().
-// Pop is equivalent to Remove(h, 0).
-func (h *heap32) Pop() (uint32, bool) {
-	n := h.Len() - 1
-	if n < 0 {
-		return 0, false
-	}
-
-	h.Swap(0, n)
-	h.down(0, n)
-	return h.pop(), true
+// release empties the map and the frontier, then hands the state back to the pool for reuse
+func release(v *pathfinder) {
+	v.edges.Clear()
+	v.frontier.Reset()
+	pathfinders.Put(v)
 }
 
-func (h *heap32) pop() uint32 {
-	old := *h
-	n := len(old)
-	no := old[n-1]
-	*h = old[0 : n-1]
-	return no.Value
+// -----------------------------------------------------------------------------
+
+// frontier is a bucket-based priority queue: lower bucket indices are popped first.
+// Original implementation by Iskander Sharipov (https://github.com/quasilyte/pathing)
+type frontier struct {
+	buckets [64][]uint32 // one stack of values per bucket; Pop takes the most recently pushed
+	mask    uint64       // bit i is set while buckets[i] is non-empty
 }
 
-func (h *heap32) up(j int) {
-	for {
-		i := (j - 1) / 2 // parent
-		if i == j || !h.Less(j, i) {
-			break
-		}
-		h.Swap(i, j)
-		j = i
+// newFrontier creates an empty frontier with room for 16 values pre-allocated in each of its buckets
+func newFrontier() *frontier {
+	h := &frontier{}
+	for i := range &h.buckets {
+		h.buckets[i] = make([]uint32, 0, 16)
 	}
+	return h
 }
 
-func (h *heap32) down(i0, n int) bool {
-	i := i0
-	for {
-		j1 := 2*i + 1
-		if j1 >= n || j1 < 0 { // j1 < 0 after int overflow
-			break
-		}
-		j := j1 // left child
-		if j2 := j1 + 1; j2 < n && h.Less(j2, j1) {
-			j = j2 // = 2*i + 2  // right child
+func (q *frontier) Reset() {
+	buckets := &q.buckets
+
+	// Reset empties the queue while keeping every bucket's backing array.
+	// Rather than visiting all len(q.buckets) slots, start at the lowest set
+	// bit of the mask and shift the mask right one bucket at a time; the loop
+	// ends once the remaining mask is 0, i.e. past the highest non-empty bucket.
+	// Only buckets between the lowest and highest non-empty priority are touched.
+	mask := q.mask
+	offset := uint(bits.TrailingZeros64(mask))
+	mask >>= offset
+	i := offset
+	for mask != 0 {
+		if i < uint(len(buckets)) {
+			buckets[i] = buckets[i][:0]
 		}
-		if !h.Less(j, i) {
-			break
-		}
-		h.Swap(i, j)
-		i = j
+		mask >>= 1
+		i++
 	}
-	return i > i0
-}
 
-func (h heap32) Len() int {
-	return len(h)
+	q.mask = 0
 }
 
-func (h heap32) IsEmpty() bool {
-	return len(h) == 0
+func (q *frontier) IsEmpty() bool {
+	return q.mask == 0 // no bit set means no bucket holds a value
 }
 
-func (h heap32) Less(i, j int) bool {
-	return h[i].Rank < h[j].Rank
+func (q *frontier) Push(value, priority uint32) {
+	// Push files value under its priority. Priorities above 63 saturate into
+	// the last bucket; masking with 0b111111 instead would wrap 64 back to
+	// bucket 0 and pop that entry ahead of every cheaper one.
+	i := min(priority, 63)
+	q.buckets[i] = append(q.buckets[i], value)
+	q.mask |= 1 << i
 }
 
-func (h *heap32) Swap(i, j int) {
-	(*h)[i], (*h)[j] = (*h)[j], (*h)[i]
+func (q *frontier) Pop() uint32 {
+	buckets := &q.buckets
+
+	// The lowest set bit selects the lowest non-empty bucket (64 when the mask is 0).
+	// The uint index plus explicit len check is there to avoid an implicit bound check.
+	i := uint(bits.TrailingZeros64(q.mask))
+	if i < uint(len(buckets)) {
+		e := buckets[i][len(buckets[i])-1] // take the most recently pushed value
+		buckets[i] = buckets[i][:len(buckets[i])-1]
+		if len(buckets[i]) == 0 {
+			q.mask &^= 1 << i // bucket drained: clear its bit
+		}
+		return e
+	}
+
+	// The queue is empty; this 0 is indistinguishable from a stored 0, so check IsEmpty first
+	return 0
 }
diff --git a/path_test.go b/path_test.go
index c4135bd..c9b3e62 100644
--- a/path_test.go
+++ b/path_test.go
@@ -4,6 +4,7 @@
 package tile
 
 import (
+	"fmt"
 	"image"
 	"image/color"
 	"image/png"
@@ -19,14 +20,16 @@ func TestPath(t *testing.T) {
 	path, dist, found := m.Path(At(1, 1), At(7, 7), costOf)
 	assert.Equal(t, `
 .........
-. x .   .
-. x... ..
-. xxx. ..
-... x.  .
-.   xx  .
+.x  .   .
+.x ... ..
+.xxx . ..
+...x .  .
+.  xxx  .
 .....x...
-.    xx .
+.    xxx.
 .........`, plotPath(m, path))
+
+	fmt.Println(plotPath(m, path))
 	assert.Equal(t, 12, dist)
 	assert.True(t, found)
 }
@@ -35,12 +38,12 @@ func TestPathTiny(t *testing.T) {
 	m := NewGrid(6, 6)
 	path, dist, found := m.Path(At(0, 0), At(5, 5), costOf)
 	assert.Equal(t, `
- x    
- x    
- x    
- x    
- x    
- xxxx `, plotPath(m, path))
+x     
+x     
+x     
+x     
+x     
+xxxxxx`, plotPath(m, path))
 	assert.Equal(t, 10, dist)
 	assert.True(t, found)
 }
@@ -51,12 +54,15 @@ func TestDraw(t *testing.T) {
 	assert.NotNil(t, out)
 }
 
-// BenchmarkPath/9x9-8         	  210472	      5316 ns/op	   16468 B/op	       3 allocs/op
-// BenchmarkPath/300x300-8     	     463	   2546373 ns/op	 7801135 B/op	       4 allocs/op
-// BenchmarkPath/381x381-8     	     373	   2732657 ns/op	62394362 B/op	       4 allocs/op
-// BenchmarkPath/384x384-8     	     153	   7791925 ns/op	62396304 B/op	       5 allocs/op
-// BenchmarkPath/6144x6144-8   	     158	   7468206 ns/op	62395377 B/op	       3 allocs/op
-// BenchmarkPath/6147x6147-8   	     160	   7468716 ns/op	62395359 B/op	       3 allocs/op
+/*
+BenchmarkPath/9x9-24         	 2704395	       440.4 ns/op	     256 B/op	       1 allocs/op
+BenchmarkPath/300x300-24     	    1134	   1033808 ns/op	    3845 B/op	       4 allocs/op
+BenchmarkPath/381x381-24     	    2782	    377676 ns/op	    7298 B/op	       5 allocs/op
+BenchmarkPath/384x384-24     	    2716	    382663 ns/op	    7298 B/op	       5 allocs/op
+BenchmarkPath/3069x3069-24   	     847	   1368243 ns/op	  100140 B/op	       7 allocs/op
+BenchmarkPath/3072x3072-24   	     849	   1368387 ns/op	   99954 B/op	       7 allocs/op
+BenchmarkPath/6144x6144-24   	    3050	    387195 ns/op	   12802 B/op	       5 allocs/op
+*/
 func BenchmarkPath(b *testing.B) {
 	b.Run("9x9", func(b *testing.B) {
 		m := mapFrom("9x9.png")
@@ -122,9 +128,12 @@ func BenchmarkPath(b *testing.B) {
 	})
 }
 
-// BenchmarkAround/3r-8         	  352876	      3355 ns/op	     385 B/op	       1 allocs/op
-// BenchmarkAround/5r-8         	  162103	      7551 ns/op	     931 B/op	       2 allocs/op
-// BenchmarkAround/10r-8        	   62491	     19235 ns/op	    3489 B/op	       2 allocs/op
+/*
+cpu: 13th Gen Intel(R) Core(TM) i7-13700K
+BenchmarkAround/3r-24 	 2080566	     562.7 ns/op	       0 B/op	       0 allocs/op
+BenchmarkAround/5r-24 	  885582	      1358 ns/op	       0 B/op	       0 allocs/op
+BenchmarkAround/10r-24    300672	      3953 ns/op	       0 B/op	       0 allocs/op
+*/
 func BenchmarkAround(b *testing.B) {
 	m := mapFrom("300x300.png")
 	b.Run("3r", func(b *testing.B) {
@@ -175,10 +184,13 @@ func TestAroundMiss(t *testing.T) {
 	})
 }
 
-// BenchmarkHeap-8   	   94454	     12303 ns/op	    3968 B/op	       5 allocs/op
+/*
+cpu: 13th Gen Intel(R) Core(TM) i7-13700K
+BenchmarkHeap-24    	  240228	      5076 ns/op	    6016 B/op	      68 allocs/op
+*/
 func BenchmarkHeap(b *testing.B) {
 	for i := 0; i < b.N; i++ {
-		h := newHeap32(16)
+		h := newFrontier()
 		for j := 0; j < 128; j++ {
 			h.Push(rand(j), 1)
 		}
@@ -189,28 +201,6 @@ func BenchmarkHeap(b *testing.B) {
 	}
 }
 
-func TestHeap(t *testing.T) {
-	h := newHeap32(16)
-	h.Push(1, 0)
-	h.Pop()
-}
-
-func TestNewHeap(t *testing.T) {
-	h := newHeap32(16)
-	for j := 0; j < 8; j++ {
-		h.Push(rand(j), uint32(j))
-	}
-
-	val, _ := h.Pop()
-	for j := 1; j < 128; j++ {
-		newval, ok := h.Pop()
-		if ok {
-			assert.True(t, val < newval)
-			val = newval
-		}
-	}
-}
-
 // very fast semi-random function
 func rand(i int) uint32 {
 	i = i + 10000
diff --git a/point.go b/point.go
index d8a900b..eb34f1e 100644
--- a/point.go
+++ b/point.go
@@ -269,3 +269,35 @@ func (v Direction) String() string {
 func (v Direction) Vector(scale int16) Point {
 	return Point{}.MoveBy(v, scale)
 }
+
+// angleOf returns the direction of the single-tile step from one point to an adjacent one
+func angleOf(from, to Point) Direction {
+	dx := to.X - from.X
+	dy := to.Y - from.Y
+
+	switch {
+	case dx == 0 && dy == -1:
+		return North
+	case dx == 1 && dy == -1:
+		return NorthEast
+	case dx == 1 && dy == 0:
+		return East
+	case dx == 1 && dy == 1:
+		return SouthEast
+	case dx == 0 && dy == 1:
+		return South
+	case dx == -1 && dy == 1:
+		return SouthWest
+	case dx == -1 && dy == 0:
+		return West
+	case dx == -1 && dy == -1:
+		return NorthWest
+	default:
+		return Direction(0) // Not a one-tile step. NOTE(review): presumably equal to a real direction constant — confirm
+	}
+}
+
+// oppositeDirection returns (dir + 4) mod 8: the reverse heading, assuming the eight directions are numbered in compass order
+func oppositeDirection(dir Direction) Direction {
+	return Direction((dir + 4) % 8)
+}