From 185b47f19f59121906143cdb1eaeb9912667038d Mon Sep 17 00:00:00 2001 From: Brian McGee Date: Sat, 13 Jul 2024 15:55:25 +0100 Subject: [PATCH] feat: refactor cache usage Signed-off-by: Brian McGee --- cache/cache.go | 354 ++++++++++++---------------------- cli/cli.go | 4 + cli/format.go | 93 +++++---- cli/format_test.go | 6 +- format/cache.go | 93 +++++++++ go.sum | 20 +- test/examples/ruby/bundler.rb | 4 +- walker/filesystem.go | 77 +++++++- walker/filesystem_test.go | 10 +- walker/git.go | 31 ++- walker/walker.go | 22 ++- 11 files changed, 407 insertions(+), 307 deletions(-) create mode 100644 format/cache.go diff --git a/cache/cache.go b/cache/cache.go index 3d9b3c59..e1ae51a1 100644 --- a/cache/cache.go +++ b/cache/cache.go @@ -1,293 +1,193 @@ package cache import ( - "context" "crypto/sha1" "encoding/hex" "fmt" "os" - "runtime" + "strings" "time" - "git.numtide.com/numtide/treefmt/stats" - - "git.numtide.com/numtide/treefmt/format" - "git.numtide.com/numtide/treefmt/walker" - - "github.com/charmbracelet/log" - "github.com/adrg/xdg" + "github.com/charmbracelet/log" "github.com/vmihailenco/msgpack/v5" bolt "go.etcd.io/bbolt" ) const ( - pathsBucket = "paths" - formattersBucket = "formatters" + bucketPaths = "paths" + bucketFormatters = "formatters" + + noCacheDbPattern = "treefmt-no-cache-*.db" ) -// Entry represents a cache entry, indicating the last size and modified time for a file path. type Entry struct { Size int64 Modified time.Time } -var ( - db *bolt.DB - logger *log.Logger - - ReadBatchSize = 1024 * runtime.NumCPU() -) - -// Open creates an instance of bolt.DB for a given treeRoot path. -// If clean is true, Open will delete any existing data in the cache. -// -// The database will be located in `XDG_CACHE_DIR/treefmt/eval-cache/.db`, where is determined by hashing -// the treeRoot path. This associates a given treeRoot with a given instance of the cache. -func Open(treeRoot string, clean bool, formatters map[string]*format.Formatter) (err error) { - logger = log.WithPrefix("cache") - - // determine a unique and consistent db name for the tree root - h := sha1.New() - h.Write([]byte(treeRoot)) - digest := h.Sum(nil) +type Bucket[V any] struct { + bucket *bolt.Bucket +} - name := hex.EncodeToString(digest) - path, err := xdg.CacheFile(fmt.Sprintf("treefmt/eval-cache/%v.db", name)) - if err != nil { - return fmt.Errorf("could not resolve local path for the cache: %w", err) +func (b *Bucket[V]) Get(key string) (*V, error) { + bytes := b.bucket.Get([]byte(key)) + if bytes == nil { + return nil, nil + } + var value V + if err := msgpack.Unmarshal(bytes, &value); err != nil { + return nil, fmt.Errorf("failed to unmarshal cache entry for key '%v': %w", key, err) } + return &value, nil +} - db, err = bolt.Open(path, 0o600, nil) - if err != nil { - return fmt.Errorf("failed to open cache at %v: %w", path, err) +func (b *Bucket[V]) Put(key string, value *V) error { + if bytes, err := msgpack.Marshal(value); err != nil { + return fmt.Errorf("failed to marshal cache entry for key %v: %w", key, err) + } else if err = b.bucket.Put([]byte(key), bytes); err != nil { + return fmt.Errorf("failed to put cache entry for key %v: %w", key, err) } + return nil +} - err = db.Update(func(tx *bolt.Tx) error { - // create bucket for tracking paths - pathsBucket, err := tx.CreateBucketIfNotExists([]byte(pathsBucket)) - if err != nil { - return fmt.Errorf("failed to create paths bucket: %w", err) - } +func (b *Bucket[V]) Delete(key string) error { + return b.bucket.Delete([]byte(key)) +} - // create bucket for tracking formatters - formattersBucket, err := tx.CreateBucketIfNotExists([]byte(formattersBucket)) - if err != nil { - return fmt.Errorf("failed to create formatters bucket: %w", err) +func (b *Bucket[V]) DeleteAll() error { + c := b.bucket.Cursor() + for k, v := c.First(); !(k == nil && v == nil); k, v = c.Next() { + if err := c.Delete(); err != nil { + return fmt.Errorf("failed to remove cache entry for key %s: %w", string(k), err) } + } + return nil +} - // check for any newly configured or modified formatters - for name, formatter := range formatters { - - stat, err := os.Lstat(formatter.Executable()) - if err != nil { - return fmt.Errorf("failed to stat formatter executable %v: %w", formatter.Executable(), err) - } - - entry, err := getEntry(formattersBucket, name) - if err != nil { - return fmt.Errorf("failed to retrieve cache entry for formatter %v: %w", name, err) - } - - isNew := entry == nil - hasChanged := entry != nil && !(entry.Size == stat.Size() && entry.Modified == stat.ModTime()) - - if isNew { - logger.Debugf("formatter '%s' is new", name) - } else if hasChanged { - logger.Debug("formatter '%s' has changed", - name, - "size", stat.Size(), - "modTime", stat.ModTime(), - "cachedSize", entry.Size, - "cachedModTime", entry.Modified, - ) - } - - // update overall clean flag - clean = clean || isNew || hasChanged - - // record formatters info - entry = &Entry{ - Size: stat.Size(), - Modified: stat.ModTime(), - } - - if err = putEntry(formattersBucket, name, entry); err != nil { - return fmt.Errorf("failed to write cache entry for formatter %v: %w", name, err) - } +func (b *Bucket[V]) ForEach(f func(string, *V) error) error { + return b.bucket.ForEach(func(key, bytes []byte) error { + var value V + if err := msgpack.Unmarshal(bytes, &value); err != nil { + return fmt.Errorf("failed to unmarshal cache entry for key '%v': %w", key, err) } + return f(string(key), &value) + }) +} - // check for any removed formatters - if err = formattersBucket.ForEach(func(key []byte, _ []byte) error { - _, ok := formatters[string(key)] - if !ok { - // remove the formatter entry from the cache - if err = formattersBucket.Delete(key); err != nil { - return fmt.Errorf("failed to remove cache entry for formatter %v: %w", key, err) - } - // indicate a clean is required - clean = true - } - return nil - }); err != nil { - return fmt.Errorf("failed to check cache for removed formatters: %w", err) - } +type Tx struct { + tx *bolt.Tx +} - if clean { - // remove all path entries - c := pathsBucket.Cursor() - for k, v := c.First(); !(k == nil && v == nil); k, v = c.Next() { - if err = c.Delete(); err != nil { - return fmt.Errorf("failed to remove path entry: %w", err) - } - } - } +func (t *Tx) Commit() error { + return t.tx.Commit() +} - return nil - }) +func (t *Tx) Rollback() error { + return t.tx.Rollback() +} - return +func (t *Tx) Paths() (*Bucket[Entry], error) { + return t.cacheBucket(bucketPaths) } -// Close closes any open instance of the cache. -func Close() error { - if db == nil { - return nil - } - return db.Close() +func (t *Tx) Formatters() (*Bucket[Entry], error) { + return t.cacheBucket(bucketFormatters) } -// getEntry is a helper for reading cache entries from bolt. -func getEntry(bucket *bolt.Bucket, path string) (*Entry, error) { - b := bucket.Get([]byte(path)) - if b != nil { - var cached Entry - if err := msgpack.Unmarshal(b, &cached); err != nil { - return nil, fmt.Errorf("failed to unmarshal cache info for path '%v': %w", path, err) - } - return &cached, nil +func (t *Tx) cacheBucket(name string) (*Bucket[Entry], error) { + var b *bolt.Bucket + var err error + if t.tx.Writable() { + b, err = t.tx.CreateBucketIfNotExists([]byte(name)) } else { - return nil, nil + b = t.tx.Bucket([]byte(name)) } -} - -// putEntry is a helper for writing cache entries into bolt. -func putEntry(bucket *bolt.Bucket, path string, entry *Entry) error { - bytes, err := msgpack.Marshal(entry) if err != nil { - return fmt.Errorf("failed to marshal cache path %v: %w", path, err) + return nil, fmt.Errorf("failed to get/create bucket %s: %w", bucketPaths, err) } + return &Bucket[Entry]{b}, nil +} - if err = bucket.Put([]byte(path), bytes); err != nil { - return fmt.Errorf("failed to put cache path %v: %w", path, err) - } - return nil +type Cache struct { + db *bolt.DB + Temporary bool } -// ChangeSet is used to walk a filesystem, starting at root, and outputting any new or changed paths using pathsCh. -// It determines if a path is new or has changed by comparing against cache entries. -func ChangeSet(ctx context.Context, wk walker.Walker, filesCh chan<- *walker.File) error { - start := time.Now() +func (c *Cache) BeginTx(writeable bool) (*Tx, error) { + tx, err := c.db.Begin(writeable) + return &Tx{tx}, err +} - defer func() { - logger.Debugf("finished generating change set in %v", time.Since(start)) - }() +func (c *Cache) View(f func(*Tx) error) error { + return c.db.View(func(tx *bolt.Tx) error { + return f(&Tx{tx}) + }) +} - var tx *bolt.Tx - var bucket *bolt.Bucket - var processed int +func (c *Cache) Update(f func(*Tx) error) error { + return c.db.Update(func(tx *bolt.Tx) error { + return f(&Tx{tx}) + }) +} +func (c *Cache) Close() error { + path := c.db.Path() defer func() { - // close any pending read tx - if tx != nil { - _ = tx.Rollback() - } - }() - - return wk.Walk(ctx, func(file *walker.File, err error) error { - select { - case <-ctx.Done(): - return ctx.Err() - default: - if err != nil { - return fmt.Errorf("failed to walk path: %w", err) - } else if file.Info.IsDir() { - // ignore directories - return nil - } - } - - // open a new read tx if there isn't one in progress - // we have to periodically open a new read tx to prevent writes from being blocked - if tx == nil { - tx, err = db.Begin(false) - if err != nil { - return fmt.Errorf("failed to open a new cache read tx: %w", err) + if c.Temporary { + if err := os.Remove(path); err != nil { + log.Errorf("failed to remove temporary cache file %s: %v", path, err) } - bucket = tx.Bucket([]byte(pathsBucket)) - } - - cached, err := getEntry(bucket, file.RelPath) - if err != nil { - return err - } - - changedOrNew := cached == nil || !(cached.Modified == file.Info.ModTime() && cached.Size == file.Info.Size()) - - stats.Add(stats.Traversed, 1) - if !changedOrNew { - // no change - return nil + log.Debugf("successfully removed temporary cache file %s", path) } + }() + return c.db.Close() +} - stats.Add(stats.Emitted, 1) +func Open(path string) (*Cache, error) { + // open db + db, err := bolt.Open(path, 0o600, nil) + if err != nil { + return nil, err + } - // pass on the path - select { - case <-ctx.Done(): - return ctx.Err() - default: - filesCh <- file - } + c := &Cache{ + db: db, + Temporary: strings.Contains(path, "treefmt-no-cache-"), + } - // close the current tx if we have reached the batch size - processed += 1 - if processed == ReadBatchSize { - err = tx.Rollback() - tx = nil + // force creation of buckets if they don't already exist + return c, c.Update(func(tx *Tx) error { + if _, err := tx.Paths(); err != nil { return err } - - return nil + _, err = tx.Formatters() + return err }) } -// Update is used to record updated cache information for the specified list of paths. -func Update(files []*walker.File) error { - start := time.Now() - defer func() { - logger.Debugf("finished processing %v paths in %v", len(files), time.Since(start)) - }() - - if len(files) == 0 { - return nil +func Path(treeRoot string, noCache bool) (string, error) { + // if no cache is desired, rather than complicate the logic for cache / no cache, we just create a cache instance + // backed by a temporary file instead + if noCache { + if f, err := os.CreateTemp("", "treefmt-no-cache-*.db"); err != nil { + return "", fmt.Errorf("failed to create a temporary db file: %w", err) + } else { + return f.Name(), nil + } } - return db.Update(func(tx *bolt.Tx) error { - bucket := tx.Bucket([]byte(pathsBucket)) + // determine a unique and consistent db name based on the tree root + h := sha1.New() + h.Write([]byte(treeRoot)) + digest := h.Sum(nil) - for _, f := range files { - entry := Entry{ - Size: f.Info.Size(), - Modified: f.Info.ModTime(), - } + name := hex.EncodeToString(digest) - if err := putEntry(bucket, f.RelPath, &entry); err != nil { - return err - } - } + path, err := xdg.CacheFile(fmt.Sprintf("treefmt/eval-cache/%v.db", name)) + if err != nil { + err = fmt.Errorf("could not resolve local path for the cache: %w", err) + } - return nil - }) + return path, err } diff --git a/cli/cli.go b/cli/cli.go index 655e670a..fd3a00c9 100644 --- a/cli/cli.go +++ b/cli/cli.go @@ -1,6 +1,7 @@ package cli import ( + "git.numtide.com/numtide/treefmt/cache" "os" "github.com/gobwas/glob" @@ -41,6 +42,9 @@ type Format struct { formatters map[string]*format.Formatter globalExcludes []glob.Glob + cache *cache.Cache + + walker walker.Walker fileCh chan *walker.File formattedCh chan *walker.File processedCh chan *walker.File diff --git a/cli/format.go b/cli/format.go index 34cbfdb4..d55aa6b8 100644 --- a/cli/format.go +++ b/cli/format.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "git.numtide.com/numtide/treefmt/cache" "io" "os" "os/signal" @@ -16,7 +17,6 @@ import ( "git.numtide.com/numtide/treefmt/format" "git.numtide.com/numtide/treefmt/stats" - "git.numtide.com/numtide/treefmt/cache" "git.numtide.com/numtide/treefmt/config" "git.numtide.com/numtide/treefmt/walker" @@ -31,6 +31,7 @@ const ( var ErrFailOnChange = errors.New("unexpected changes detected, --fail-on-change is enabled") func (f *Format) Run() (err error) { + // set log level and other options f.configureLogging() @@ -53,13 +54,6 @@ func (f *Format) Run() (err error) { // create a prefixed logger log.SetPrefix("format") - // ensure cache is closed on return - defer func() { - if err := cache.Close(); err != nil { - log.Errorf("failed to close cache: %v", err) - } - }() - // find the config file unless specified if f.ConfigFile == "" { pwd, err := os.Getwd() @@ -119,12 +113,40 @@ func (f *Format) Run() (err error) { f.formatters[name] = formatter } - // open the cache if configured - if !f.NoCache { - if err = cache.Open(f.TreeRoot, f.ClearCache, f.formatters); err != nil { - // if we can't open the cache, we log a warning and fallback to no cache - log.Warnf("failed to open cache: %v", err) - f.NoCache = true + // initialise the cache + cachePath, err := cache.Path(f.TreeRoot, f.NoCache) + if err != nil { + return err + } else if f.cache, err = cache.Open(cachePath); err != nil { + return fmt.Errorf("failed to open cache") + } + log.Debugf("opened cache @ %s", cachePath) + + // ensure the cache is closed on shutdown + defer func() { + if err := f.cache.Close(); err != nil { + log.Errorf("failed to close cache: %v", err) + } + log.Debug("successfully closed cache") + }() + + // check if the formatters set has changed in any meaningful way + if err = format.CheckFormatters(f.cache, f.formatters); err != nil { + return fmt.Errorf("failed to check formatters: %w", err) + } + log.Debugf("formatters check complete") + + // clear out the paths bucket if desired before starting + if f.ClearCache { + err = f.cache.Update(func(tx *cache.Tx) error { + paths, err := tx.Paths() + if err != nil { + return err + } + return paths.DeleteAll() + }) + if err != nil { + return fmt.Errorf("failed to clear paths from cache: %w", err) } } @@ -222,7 +244,8 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { } // create a filesystem walker - wk, err := walker.New(walkerType, f.TreeRoot, f.NoCache, pathCh) + var err error + f.walker, err = walker.New(walkerType, f.TreeRoot, f.cache, pathCh) if err != nil { return fmt.Errorf("failed to create walker: %w", err) } @@ -230,27 +253,17 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { // close the file channel when we're done walking the file system defer close(f.fileCh) - // if no cache has been configured, or we are processing from stdin, we invoke the walker directly - if f.NoCache || f.Stdin { - return wk.Walk(ctx, func(file *walker.File, err error) error { - select { - case <-ctx.Done(): - return ctx.Err() - default: - stats.Add(stats.Traversed, 1) - stats.Add(stats.Emitted, 1) - f.fileCh <- file - return nil - } - }) - } - - // otherwise we pass the walker to the cache and have it generate files for processing based on whether or not - // they have been added/changed since the last invocation - if err = cache.ChangeSet(ctx, wk, f.fileCh); err != nil { - return fmt.Errorf("failed to generate change set: %w", err) - } - return nil + // + return f.walker.Walk(ctx, func(file *walker.File, err error) error { + select { + case <-ctx.Done(): + return ctx.Err() + default: + stats.Add(stats.Emitted, 1) + f.fileCh <- file + return nil + } + }) } } @@ -423,17 +436,17 @@ func (f *Format) updateCache(ctx context.Context) func() error { // apply a batch processBatch := func() error { - // pass the batch to the cache for updating - if err := cache.Update(batch); err != nil { + // let the walker record updated path info + if err := f.walker.UpdatePaths(batch); err != nil { return err } + // reset the batch batch = batch[:0] return nil } // if we are processing from stdin that means we are outputting to stdout, no caching involved - // if f.NoCache is set that means either the user explicitly disabled the cache or we failed to open on - if f.Stdin || f.NoCache { + if f.Stdin { // do nothing processBatch = func() error { return nil } } diff --git a/cli/format_test.go b/cli/format_test.go index 1d6d75dd..11a77e20 100644 --- a/cli/format_test.go +++ b/cli/format_test.go @@ -412,7 +412,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { } test.WriteConfig(t, configPath, cfg) - args := []string{"--config-file", configPath, "--tree-root", tempDir} + args := []string{"--config-file", configPath, "--tree-root", tempDir, "-vv"} _, err := cmd(t, args...) as.NoError(err) assertStats(t, as, 32, 32, 3, 0) @@ -425,8 +425,8 @@ func TestBustCacheOnFormatterChange(t *testing.T) { assertStats(t, as, 32, 32, 3, 0) // check cache is working - _, err = cmd(t, args...) - as.NoError(err) + out, err := cmd(t, args...) + as.NoError(err, string(out)) assertStats(t, as, 32, 0, 0, 0) // tweak mod time of python formatter diff --git a/format/cache.go b/format/cache.go new file mode 100644 index 00000000..51278015 --- /dev/null +++ b/format/cache.go @@ -0,0 +1,93 @@ +package format + +import ( + "fmt" + "git.numtide.com/numtide/treefmt/cache" + "github.com/charmbracelet/log" + "os" +) + +func CheckFormatters(c *cache.Cache, formatters map[string]*Formatter) error { + return c.Update(func(tx *cache.Tx) error { + + clearPaths := false + + pathsBucket, err := tx.Paths() + if err != nil { + return fmt.Errorf("failed to get paths bucket from cache: %w", err) + } + + formattersBucket, err := tx.Formatters() + if err != nil { + return fmt.Errorf("failed to get formatters bucket from cache: %w", err) + } + + // check for any newly configured or modified formatters + for name, formatter := range formatters { + + stat, err := os.Lstat(formatter.Executable()) + if err != nil { + return fmt.Errorf("failed to stat formatter executable %v: %w", formatter.Executable(), err) + } + + entry, err := formattersBucket.Get(name) + if err != nil { + return fmt.Errorf("failed to retrieve cache entry for formatter %v: %w", name, err) + } + + isNew := entry == nil + hasChanged := entry != nil && !(entry.Size == stat.Size() && entry.Modified == stat.ModTime()) + + if isNew { + log.Debugf("formatter '%s' is new", name) + } else if hasChanged { + log.Debug("formatter '%s' has changed", + name, + "size", stat.Size(), + "modTime", stat.ModTime(), + "cachedSize", entry.Size, + "cachedModTime", entry.Modified, + ) + } + + // update overall flag + clearPaths = clearPaths || isNew || hasChanged + + // record formatters info + entry = &cache.Entry{ + Size: stat.Size(), + Modified: stat.ModTime(), + } + + if err = formattersBucket.Put(name, entry); err != nil { + return fmt.Errorf("failed to write cache entry for formatter %v: %w", name, err) + } + } + + // check for any removed formatters + if err = formattersBucket.ForEach(func(key string, _ *cache.Entry) error { + _, ok := formatters[key] + if !ok { + // remove the formatter entry from the cache + if err = formattersBucket.Delete(key); err != nil { + return fmt.Errorf("failed to remove cache entry for formatter %v: %w", key, err) + } + // indicate a clean is required + clearPaths = true + } + return nil + }); err != nil { + return fmt.Errorf("failed to check cache for removed formatters: %w", err) + } + + if clearPaths { + // remove all path entries + if err := pathsBucket.DeleteAll(); err != nil { + return fmt.Errorf("failed to remove all path entries from cache: %w", err) + } + } + + return nil + }) + +} diff --git a/go.sum b/go.sum index 60fc1785..1f0c0f64 100644 --- a/go.sum +++ b/go.sum @@ -46,12 +46,8 @@ github.com/go-git/go-billy/v5 v5.5.0 h1:yEY4yhzCDuMGSv83oGxiBotRzhwhNr8VZyphhiu+ github.com/go-git/go-billy/v5 v5.5.0/go.mod h1:hmexnoNsr2SJU1Ju67OaNz5ASJY3+sHgFRpCtpDCKow= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= -github.com/go-git/go-git/v5 v5.12.0 h1:7Md+ndsjrzZxbddRDZjF14qK+NN56sy6wkqaVrjZtys= -github.com/go-git/go-git/v5 v5.12.0/go.mod h1:FTM9VKtnI2m65hNI/TenDDDnUf2Q9FHnXYjuz9i5OEY= github.com/go-git/go-git/v5 v5.12.1-0.20240409060936-cd6633c3c665 h1:+M6D6MplKwRqmw8b41MArUGFsGTnl24+/S40I0yrhKs= github.com/go-git/go-git/v5 v5.12.1-0.20240409060936-cd6633c3c665/go.mod h1:QZbSbsaXQD7v0yvddAhVE2UfW5wGmeqHQ0UnOSr6JYQ= -github.com/go-git/go-git/v5 v5.12.1-0.20240516215126-9cc340a7fc5c h1:Pyuh3Y6kb/+k7yl5nLN6pcQ+7isiJ1PnBj3QSUH1hbQ= -github.com/go-git/go-git/v5 v5.12.1-0.20240516215126-9cc340a7fc5c/go.mod h1:Tzg+feu/PVazFrRFtWHxfvErT+p+Zo3Yg0nzEhJdxW4= github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= @@ -120,8 +116,6 @@ github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI= -go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE= go.etcd.io/bbolt v1.3.10 h1:+BqfJTcCzTItrop8mq/lbzL8wSGtj94UO/3U31shqG0= go.etcd.io/bbolt v1.3.10/go.mod h1:bK3UQLPJZly7IlNmV7uVHJDxfe5aK9Ll93e/74Y9oEQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -129,12 +123,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= -golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= -golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f h1:99ci1mjWVBWwJiEKYY6jWa4d2nTQVIEhZIptnrVb1XY= -golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f/go.mod h1:/lliqkxwWAhPjf5oSOIJup2XcqJaw8RGS6k3TGEc7GI= golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM= golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -146,8 +136,6 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= -golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -169,8 +157,6 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -178,9 +164,8 @@ golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuX golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= -golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -188,9 +173,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= diff --git a/test/examples/ruby/bundler.rb b/test/examples/ruby/bundler.rb index 6a3a14fc..9b177459 100644 --- a/test/examples/ruby/bundler.rb +++ b/test/examples/ruby/bundler.rb @@ -155,11 +155,11 @@ def specs_path end def cache - bundle_path.join("cache/bundler") + bundle_path.join("caching/bundler") end def user_cache - user_bundle_path.join("cache") + user_bundle_path.join("caching") end def root diff --git a/walker/filesystem.go b/walker/filesystem.go index 8a47c7af..a99d7252 100644 --- a/walker/filesystem.go +++ b/walker/filesystem.go @@ -3,17 +3,45 @@ package walker import ( "context" "fmt" + "git.numtide.com/numtide/treefmt/cache" + "git.numtide.com/numtide/treefmt/stats" "io/fs" "os" "path/filepath" + "runtime" ) type filesystemWalker struct { root string + cache *cache.Cache pathsCh chan string relPathOffset int } +func (f filesystemWalker) UpdatePaths(batch []*File) error { + if err := f.cache.Update(func(tx *cache.Tx) error { + // get the paths bucket + paths, err := tx.Paths() + if err != nil { + return err + } + for _, f := range batch { + entry := cache.Entry{ + Size: f.Info.Size(), + Modified: f.Info.ModTime(), + } + if err := paths.Put(f.RelPath, &entry); err != nil { + return err + } + } + + return nil + }); err != nil { + return fmt.Errorf("failed to update paths: %w", err) + } + return nil +} + func (f filesystemWalker) Root() string { return f.root } @@ -28,6 +56,19 @@ func (f filesystemWalker) relPath(path string) (string, error) { } func (f filesystemWalker) Walk(_ context.Context, fn WalkFunc) error { + + var tx *cache.Tx + var paths *cache.Bucket[cache.Entry] + var processed int + batchSize := 1024 * runtime.NumCPU() + + defer func() { + // close any pending read tx + if tx != nil { + _ = tx.Rollback() + } + }() + walkFn := func(path string, info fs.FileInfo, _ error) error { if info == nil { return fmt.Errorf("no such file or directory '%s'", path) @@ -48,6 +89,39 @@ func (f filesystemWalker) Walk(_ context.Context, fn WalkFunc) error { RelPath: relPath, Info: info, } + + // open a new read tx if there isn't one in progress + // we have to periodically open a new read tx to prevent writes from being blocked + if tx == nil { + if tx, err = f.cache.BeginTx(false); err != nil { + return fmt.Errorf("failed to open a new cache read tx: %w", err) + } else if paths, err = tx.Paths(); err != nil { + return fmt.Errorf("failed to get paths bucket from cache tx: %w", err) + } + } + + cached, err := paths.Get(file.RelPath) + if err != nil { + return err + } + + // close the current tx if we have reached the batch size + processed += 1 + if processed == batchSize { + if err = tx.Rollback(); err != nil { + return err + } + tx = nil + } + + // + changedOrNew := cached == nil || !(cached.Modified == file.Info.ModTime() && cached.Size == file.Info.Size()) + + stats.Add(stats.Traversed, 1) + if !changedOrNew { + // no change + return nil + } return fn(&file, err) } @@ -60,9 +134,10 @@ func (f filesystemWalker) Walk(_ context.Context, fn WalkFunc) error { return nil } -func NewFilesystem(root string, paths chan string) (Walker, error) { +func NewFilesystem(root string, cache *cache.Cache, paths chan string) (Walker, error) { return filesystemWalker{ root: root, + cache: cache, pathsCh: paths, relPathOffset: len(root) + 1, }, nil diff --git a/walker/filesystem_test.go b/walker/filesystem_test.go index 1edaa170..fcbe0f81 100644 --- a/walker/filesystem_test.go +++ b/walker/filesystem_test.go @@ -5,6 +5,9 @@ import ( "os" "testing" + "git.numtide.com/numtide/treefmt/cache" + "git.numtide.com/numtide/treefmt/stats" + "git.numtide.com/numtide/treefmt/test" "github.com/stretchr/testify/require" ) @@ -55,7 +58,12 @@ func TestFilesystemWalker_Walk(t *testing.T) { as := require.New(t) - walker, err := NewFilesystem(tempDir, paths) + stats.Init() + + c, err := cache.Open(t.TempDir() + "/test.db") + as.NoError(err, "failed to open cache") + + walker, err := NewFilesystem(tempDir, c, paths) as.NoError(err) idx := 0 diff --git a/walker/git.go b/walker/git.go index bbf30652..4b51c2e2 100644 --- a/walker/git.go +++ b/walker/git.go @@ -3,6 +3,8 @@ package walker import ( "context" "fmt" + "git.numtide.com/numtide/treefmt/cache" + "git.numtide.com/numtide/treefmt/stats" "io/fs" "os" "path/filepath" @@ -24,6 +26,11 @@ type gitWalker struct { relPathOffset int } +func (g gitWalker) UpdatePaths(_ []*File) error { + // nothing to do, git is doing all the tracking + return nil +} + func (g gitWalker) Root() string { return g.root } @@ -44,7 +51,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } // cache in-memory whether a path is present in the git index - var cache map[string]*index.Entry + var indexCache map[string]*index.Entry // by default, we only emit files if they have changes when compared with the git index emitFile := func(entry *index.Entry, info os.FileInfo) bool { @@ -98,6 +105,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { Info: info, } + stats.Add(stats.Traversed, 1) if err = fn(&file, err); err != nil { return err } @@ -107,10 +115,10 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } // otherwise we ensure the git index entries are cached and then check if the path is in the git index - if cache == nil { - cache = make(map[string]*index.Entry) + if indexCache == nil { + indexCache = make(map[string]*index.Entry) for _, entry := range idx.Entries { - cache[entry.Name] = entry + indexCache[entry.Name] = entry } } @@ -119,7 +127,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to find relative path for %v: %w", path, err) } - _, ok := cache[relPath] + _, ok := indexCache[relPath] if !(path == g.root || ok) { log.Debugf("path %v not found in git index, skipping", path) continue @@ -136,7 +144,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to determine a relative path for %s: %w", path, err) } - if entry, ok := cache[relPath]; !ok { + if entry, ok := indexCache[relPath]; !ok { log.Debugf("path %v not found in git index, skipping", path) return nil } else if !emitFile(entry, info) { @@ -150,6 +158,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { Info: info, } + stats.Add(stats.Traversed, 1) return fn(&file, err) }) } @@ -159,18 +168,24 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { func NewGit( root string, - noCache bool, + cache *cache.Cache, paths chan string, ) (Walker, error) { repo, err := git.PlainOpen(root) if err != nil { return nil, fmt.Errorf("failed to open git repo: %w", err) } + + // immediately close the cache as we have no need for it + if err = cache.Close(); err != nil { + return nil, fmt.Errorf("failed to close cache: %w", err) + } + return &gitWalker{ root: root, paths: paths, repo: repo, - noCache: noCache, + noCache: cache.Temporary, relPathOffset: len(root) + 1, }, nil } diff --git a/walker/walker.go b/walker/walker.go index 270854cf..84d4b361 100644 --- a/walker/walker.go +++ b/walker/walker.go @@ -3,6 +3,7 @@ package walker import ( "context" "fmt" + "git.numtide.com/numtide/treefmt/cache" "io/fs" "os" "time" @@ -54,26 +55,33 @@ type WalkFunc func(file *File, err error) error type Walker interface { Root() string Walk(ctx context.Context, fn WalkFunc) error + UpdatePaths(batch []*File) error } -func New(walkerType Type, root string, noCache bool, pathsCh chan string) (Walker, error) { +func New( + walkerType Type, + root string, + cache *cache.Cache, + pathsCh chan string, +) (Walker, error) { + switch walkerType { case Git: - return NewGit(root, noCache, pathsCh) + return NewGit(root, cache, pathsCh) case Auto: - return Detect(root, noCache, pathsCh) + return Detect(root, cache, pathsCh) case Filesystem: - return NewFilesystem(root, pathsCh) + return NewFilesystem(root, cache, pathsCh) default: return nil, fmt.Errorf("unknown walker type: %v", walkerType) } } -func Detect(root string, noCache bool, pathsCh chan string) (Walker, error) { +func Detect(root string, cache *cache.Cache, pathsCh chan string) (Walker, error) { // for now, we keep it simple and try git first, filesystem second - w, err := NewGit(root, noCache, pathsCh) + w, err := NewGit(root, cache, pathsCh) if err == nil { return w, err } - return NewFilesystem(root, pathsCh) + return NewFilesystem(root, cache, pathsCh) }