From 9253e50dd4f609ae67e44ae89827542af752e726 Mon Sep 17 00:00:00 2001 From: Brian McGee Date: Tue, 9 Jul 2024 20:08:50 +0100 Subject: [PATCH 1/3] feat: only emit changed files with git walker If the modified time has not changed when compared with the git index we do not emit the file for processing. This allows users to introduce treefmt to a repository without suffering an initial large formatting commit. Instead, files can be formatted incrementally as they are changed. Closes #311 Signed-off-by: Brian McGee --- cache/cache.go | 8 ++--- cli/cli.go | 17 ++++----- cli/format.go | 36 +++++++++---------- format/formatter.go | 4 +-- format/task.go | 6 ++-- {walk => walker}/filesystem.go | 2 +- {walk => walker}/filesystem_test.go | 2 +- {walk => walker}/git.go | 54 +++++++++++++++++++++++------ {walk => walker}/walker.go | 12 +++---- 9 files changed, 87 insertions(+), 54 deletions(-) rename {walk => walker}/filesystem.go (98%) rename {walk => walker}/filesystem_test.go (99%) rename {walk => walker}/git.go (69%) rename {walk => walker}/walker.go (83%) diff --git a/cache/cache.go b/cache/cache.go index fe1c1ac2..3d9b3c59 100644 --- a/cache/cache.go +++ b/cache/cache.go @@ -12,7 +12,7 @@ import ( "git.numtide.com/numtide/treefmt/stats" "git.numtide.com/numtide/treefmt/format" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "github.com/charmbracelet/log" @@ -187,7 +187,7 @@ func putEntry(bucket *bolt.Bucket, path string, entry *Entry) error { // ChangeSet is used to walk a filesystem, starting at root, and outputting any new or changed paths using pathsCh. // It determines if a path is new or has changed by comparing against cache entries. -func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.File) error { +func ChangeSet(ctx context.Context, wk walker.Walker, filesCh chan<- *walker.File) error { start := time.Now() defer func() { @@ -205,7 +205,7 @@ func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.Fil } }() - return walker.Walk(ctx, func(file *walk.File, err error) error { + return wk.Walk(ctx, func(file *walker.File, err error) error { select { case <-ctx.Done(): return ctx.Err() @@ -264,7 +264,7 @@ func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.Fil } // Update is used to record updated cache information for the specified list of paths. -func Update(files []*walk.File) error { +func Update(files []*walker.File) error { start := time.Now() defer func() { logger.Debugf("finished processing %v paths in %v", len(files), time.Since(start)) diff --git a/cli/cli.go b/cli/cli.go index 0d56cafb..655e670a 100644 --- a/cli/cli.go +++ b/cli/cli.go @@ -6,7 +6,7 @@ import ( "github.com/gobwas/glob" "git.numtide.com/numtide/treefmt/format" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "github.com/alecthomas/kong" "github.com/charmbracelet/log" ) @@ -25,10 +25,11 @@ type Format struct { Formatters []string `short:"f" help:"Specify formatters to apply. Defaults to all formatters."` TreeRoot string `type:"existingdir" xor:"tree-root" help:"The root directory from which treefmt will start walking the filesystem (defaults to the directory containing the config file)."` TreeRootFile string `type:"string" xor:"tree-root" help:"File to search for to find the project root (if --tree-root is not passed)."` - Walk walk.Type `enum:"auto,git,filesystem" default:"auto" help:"The method used to traverse the files within --tree-root. Currently supports 'auto', 'git' or 'filesystem'."` - Verbosity int `name:"verbose" short:"v" type:"counter" default:"0" env:"LOG_LEVEL" help:"Set the verbosity of logs e.g. -vv."` - Version bool `name:"version" short:"V" help:"Print version."` - Init bool `name:"init" short:"i" help:"Create a new treefmt.toml."` + Walk walker.Type `enum:"auto,git,filesystem" default:"auto" help:"The method used to traverse the files within --tree-root. Currently supports 'auto', 'git' or 'filesystem'."` + + Verbosity int `name:"verbose" short:"v" type:"counter" default:"0" env:"LOG_LEVEL" help:"Set the verbosity of logs e.g. -vv."` + Version bool `name:"version" short:"V" help:"Print version."` + Init bool `name:"init" short:"i" help:"Create a new treefmt.toml."` OnUnmatched log.Level `name:"on-unmatched" short:"u" default:"warn" help:"Log paths that did not match any formatters at the specified log level, with fatal exiting the process with an error. Possible values are ."` @@ -40,9 +41,9 @@ type Format struct { formatters map[string]*format.Formatter globalExcludes []glob.Glob - filesCh chan *walk.File - formattedCh chan *walk.File - processedCh chan *walk.File + fileCh chan *walker.File + formattedCh chan *walker.File + processedCh chan *walker.File } func (f *Format) configureLogging() { diff --git a/cli/format.go b/cli/format.go index 95ff37bc..34cbfdb4 100644 --- a/cli/format.go +++ b/cli/format.go @@ -18,7 +18,7 @@ import ( "git.numtide.com/numtide/treefmt/cache" "git.numtide.com/numtide/treefmt/config" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "github.com/charmbracelet/log" "golang.org/x/sync/errgroup" @@ -147,13 +147,13 @@ func (f *Format) Run() (err error) { // create a channel for files needing to be processed // we use a multiple of batch size here as a rudimentary concurrency optimization based on the host machine - f.filesCh = make(chan *walk.File, BatchSize*runtime.NumCPU()) + f.fileCh = make(chan *walker.File, BatchSize*runtime.NumCPU()) // create a channel for files that have been formatted - f.formattedCh = make(chan *walk.File, cap(f.filesCh)) + f.formattedCh = make(chan *walker.File, cap(f.fileCh)) // create a channel for files that have been processed - f.processedCh = make(chan *walk.File, cap(f.filesCh)) + f.processedCh = make(chan *walker.File, cap(f.fileCh)) // start concurrent processing tasks in reverse order eg.Go(f.updateCache(ctx)) @@ -168,14 +168,14 @@ func (f *Format) Run() (err error) { func (f *Format) walkFilesystem(ctx context.Context) func() error { return func() error { eg, ctx := errgroup.WithContext(ctx) - pathsCh := make(chan string, BatchSize) + pathCh := make(chan string, BatchSize) // By default, we use the cli arg, but if the stdin flag has been set we force a filesystem walk // since we will only be processing one file from a temp directory walkerType := f.Walk if f.Stdin { - walkerType = walk.Filesystem + walkerType = walker.Filesystem // check we have only received one path arg which we use for the file extension / matching to formatters if len(f.Paths) != 1 { @@ -197,7 +197,7 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { } walkPaths := func() error { - defer close(pathsCh) + defer close(pathCh) var idx int for idx < len(f.Paths) { @@ -205,7 +205,7 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { case <-ctx.Done(): return ctx.Err() default: - pathsCh <- f.Paths[idx] + pathCh <- f.Paths[idx] idx += 1 } } @@ -217,29 +217,29 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { eg.Go(walkPaths) } else { // no explicit paths to process, so we only need to process root - pathsCh <- f.TreeRoot - close(pathsCh) + pathCh <- f.TreeRoot + close(pathCh) } // create a filesystem walker - walker, err := walk.New(walkerType, f.TreeRoot, pathsCh) + wk, err := walker.New(walkerType, f.TreeRoot, f.NoCache, pathCh) if err != nil { return fmt.Errorf("failed to create walker: %w", err) } - // close the files channel when we're done walking the file system - defer close(f.filesCh) + // close the file channel when we're done walking the file system + defer close(f.fileCh) // if no cache has been configured, or we are processing from stdin, we invoke the walker directly if f.NoCache || f.Stdin { - return walker.Walk(ctx, func(file *walk.File, err error) error { + return wk.Walk(ctx, func(file *walker.File, err error) error { select { case <-ctx.Done(): return ctx.Err() default: stats.Add(stats.Traversed, 1) stats.Add(stats.Emitted, 1) - f.filesCh <- file + f.fileCh <- file return nil } }) @@ -247,7 +247,7 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { // otherwise we pass the walker to the cache and have it generate files for processing based on whether or not // they have been added/changed since the last invocation - if err = cache.ChangeSet(ctx, walker, f.filesCh); err != nil { + if err = cache.ChangeSet(ctx, wk, f.fileCh); err != nil { return fmt.Errorf("failed to generate change set: %w", err) } return nil @@ -319,7 +319,7 @@ func (f *Format) applyFormatters(ctx context.Context) func() error { }() // iterate the files channel - for file := range f.filesCh { + for file := range f.fileCh { // first check if this file has been globally excluded if format.PathMatches(file.RelPath, f.globalExcludes) { @@ -419,7 +419,7 @@ func (f *Format) detectFormatted(ctx context.Context) func() error { func (f *Format) updateCache(ctx context.Context) func() error { return func() error { // used to batch updates for more efficient txs - batch := make([]*walk.File, 0, BatchSize) + batch := make([]*walker.File, 0, BatchSize) // apply a batch processBatch := func() error { diff --git a/format/formatter.go b/format/formatter.go index 36687393..4e611f44 100644 --- a/format/formatter.go +++ b/format/formatter.go @@ -8,7 +8,7 @@ import ( "os/exec" "time" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "git.numtide.com/numtide/treefmt/config" @@ -89,7 +89,7 @@ func (f *Formatter) Apply(ctx context.Context, tasks []*Task) error { // Wants is used to test if a Formatter wants a path based on it's configured Includes and Excludes patterns. // Returns true if the Formatter should be applied to path, false otherwise. -func (f *Formatter) Wants(file *walk.File) bool { +func (f *Formatter) Wants(file *walker.File) bool { match := !PathMatches(file.RelPath, f.excludes) && PathMatches(file.RelPath, f.includes) if match { f.log.Debugf("match: %v", file) diff --git a/format/task.go b/format/task.go index 062b4544..81d16602 100644 --- a/format/task.go +++ b/format/task.go @@ -4,16 +4,16 @@ import ( "cmp" "slices" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" ) type Task struct { - File *walk.File + File *walker.File Formatters []*Formatter BatchKey string } -func NewTask(file *walk.File, formatters []*Formatter) Task { +func NewTask(file *walker.File, formatters []*Formatter) Task { // sort by priority in ascending order slices.SortFunc(formatters, func(a, b *Formatter) int { priorityA := a.Priority() diff --git a/walk/filesystem.go b/walker/filesystem.go similarity index 98% rename from walk/filesystem.go rename to walker/filesystem.go index 729a4972..8a47c7af 100644 --- a/walk/filesystem.go +++ b/walker/filesystem.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" diff --git a/walk/filesystem_test.go b/walker/filesystem_test.go similarity index 99% rename from walk/filesystem_test.go rename to walker/filesystem_test.go index 503ec0d3..1edaa170 100644 --- a/walk/filesystem_test.go +++ b/walker/filesystem_test.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" diff --git a/walk/git.go b/walker/git.go similarity index 69% rename from walk/git.go rename to walker/git.go index 4dc91967..bbf30652 100644 --- a/walk/git.go +++ b/walker/git.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" @@ -6,6 +6,9 @@ import ( "io/fs" "os" "path/filepath" + "time" + + "github.com/go-git/go-git/v5/plumbing/format/index" "github.com/charmbracelet/log" @@ -13,9 +16,11 @@ import ( ) type gitWalker struct { - root string - paths chan string - repo *git.Repository + root string + paths chan string + repo *git.Repository + + noCache bool relPathOffset int } @@ -39,7 +44,20 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } // cache in-memory whether a path is present in the git index - var cache map[string]bool + var cache map[string]*index.Entry + + // by default, we only emit files if they have changes when compared with the git index + emitFile := func(entry *index.Entry, info os.FileInfo) bool { + // mod time comparison is done with EPOCH (second) precision as per the POSIX spec + return entry.ModifiedAt.Truncate(time.Second) != info.ModTime().Truncate(time.Second) + } + + if g.noCache { + // emit all files in the index + emitFile = func(entry *index.Entry, info os.FileInfo) bool { + return true + } + } for path := range g.paths { @@ -63,6 +81,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to stat %s: %w", path, err) } + // skip processing if the file hasn't changed + if !emitFile(entry, info) { + continue + } + // determine a relative path relPath, err := g.relPath(path) if err != nil { @@ -83,11 +106,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { continue } - // otherwise we ensure the git index entries are cached and then check if they are in the git index + // otherwise we ensure the git index entries are cached and then check if the path is in the git index if cache == nil { - cache = make(map[string]bool) + cache = make(map[string]*index.Entry) for _, entry := range idx.Entries { - cache[entry.Name] = true + cache[entry.Name] = entry } } @@ -103,7 +126,8 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } return filepath.Walk(path, func(path string, info fs.FileInfo, _ error) error { - if info.IsDir() { + // ignore directories and symlinks + if info.IsDir() || info.Mode()&os.ModeSymlink == os.ModeSymlink { return nil } @@ -112,9 +136,12 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to determine a relative path for %s: %w", path, err) } - if _, ok := cache[relPath]; !ok { + if entry, ok := cache[relPath]; !ok { log.Debugf("path %v not found in git index, skipping", path) return nil + } else if !emitFile(entry, info) { + log.Debugf("path %v has not changed, skipping", path) + return nil } file := File{ @@ -130,7 +157,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return nil } -func NewGit(root string, paths chan string) (Walker, error) { +func NewGit( + root string, + noCache bool, + paths chan string, +) (Walker, error) { repo, err := git.PlainOpen(root) if err != nil { return nil, fmt.Errorf("failed to open git repo: %w", err) @@ -139,6 +170,7 @@ func NewGit(root string, paths chan string) (Walker, error) { root: root, paths: paths, repo: repo, + noCache: noCache, relPathOffset: len(root) + 1, }, nil } diff --git a/walk/walker.go b/walker/walker.go similarity index 83% rename from walk/walker.go rename to walker/walker.go index e3afb77f..270854cf 100644 --- a/walk/walker.go +++ b/walker/walker.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" @@ -56,12 +56,12 @@ type Walker interface { Walk(ctx context.Context, fn WalkFunc) error } -func New(walkerType Type, root string, pathsCh chan string) (Walker, error) { +func New(walkerType Type, root string, noCache bool, pathsCh chan string) (Walker, error) { switch walkerType { case Git: - return NewGit(root, pathsCh) + return NewGit(root, noCache, pathsCh) case Auto: - return Detect(root, pathsCh) + return Detect(root, noCache, pathsCh) case Filesystem: return NewFilesystem(root, pathsCh) default: @@ -69,9 +69,9 @@ func New(walkerType Type, root string, pathsCh chan string) (Walker, error) { } } -func Detect(root string, pathsCh chan string) (Walker, error) { +func Detect(root string, noCache bool, pathsCh chan string) (Walker, error) { // for now, we keep it simple and try git first, filesystem second - w, err := NewGit(root, pathsCh) + w, err := NewGit(root, noCache, pathsCh) if err == nil { return w, err } From 9cfe7db2f582ba74c493400ada6c9c93ac219f6b Mon Sep 17 00:00:00 2001 From: Brian McGee Date: Sat, 13 Jul 2024 15:55:25 +0100 Subject: [PATCH 2/3] feat: refactor cache usage Signed-off-by: Brian McGee --- cache/cache.go | 361 ++++++++++++---------------------- cli/cli.go | 4 + cli/format.go | 90 +++++---- cli/format_test.go | 6 +- format/cache.go | 93 +++++++++ go.sum | 20 +- test/examples/ruby/bundler.rb | 4 +- walker/filesystem.go | 77 +++++++- walker/filesystem_test.go | 10 +- walker/git.go | 31 ++- walker/walker.go | 22 ++- 11 files changed, 407 insertions(+), 311 deletions(-) create mode 100644 format/cache.go diff --git a/cache/cache.go b/cache/cache.go index 3d9b3c59..a469caa5 100644 --- a/cache/cache.go +++ b/cache/cache.go @@ -1,293 +1,192 @@ package cache import ( - "context" "crypto/sha1" "encoding/hex" "fmt" "os" - "runtime" "time" - "git.numtide.com/numtide/treefmt/stats" - - "git.numtide.com/numtide/treefmt/format" - "git.numtide.com/numtide/treefmt/walker" - - "github.com/charmbracelet/log" - "github.com/adrg/xdg" + "github.com/charmbracelet/log" "github.com/vmihailenco/msgpack/v5" bolt "go.etcd.io/bbolt" ) const ( - pathsBucket = "paths" - formattersBucket = "formatters" + bucketPaths = "paths" + bucketFormatters = "formatters" ) -// Entry represents a cache entry, indicating the last size and modified time for a file path. type Entry struct { Size int64 Modified time.Time } -var ( - db *bolt.DB - logger *log.Logger - - ReadBatchSize = 1024 * runtime.NumCPU() -) - -// Open creates an instance of bolt.DB for a given treeRoot path. -// If clean is true, Open will delete any existing data in the cache. -// -// The database will be located in `XDG_CACHE_DIR/treefmt/eval-cache/.db`, where is determined by hashing -// the treeRoot path. This associates a given treeRoot with a given instance of the cache. -func Open(treeRoot string, clean bool, formatters map[string]*format.Formatter) (err error) { - logger = log.WithPrefix("cache") - - // determine a unique and consistent db name for the tree root - h := sha1.New() - h.Write([]byte(treeRoot)) - digest := h.Sum(nil) +type Bucket[V any] struct { + bucket *bolt.Bucket +} - name := hex.EncodeToString(digest) - path, err := xdg.CacheFile(fmt.Sprintf("treefmt/eval-cache/%v.db", name)) - if err != nil { - return fmt.Errorf("could not resolve local path for the cache: %w", err) +func (b *Bucket[V]) Get(key string) (*V, error) { + bytes := b.bucket.Get([]byte(key)) + if bytes == nil { + return nil, nil + } + var value V + if err := msgpack.Unmarshal(bytes, &value); err != nil { + return nil, fmt.Errorf("failed to unmarshal cache entry for key '%v': %w", key, err) } + return &value, nil +} - db, err = bolt.Open(path, 0o600, nil) - if err != nil { - return fmt.Errorf("failed to open cache at %v: %w", path, err) +func (b *Bucket[V]) Put(key string, value *V) error { + if bytes, err := msgpack.Marshal(value); err != nil { + return fmt.Errorf("failed to marshal cache entry for key %v: %w", key, err) + } else if err = b.bucket.Put([]byte(key), bytes); err != nil { + return fmt.Errorf("failed to put cache entry for key %v: %w", key, err) } + return nil +} - err = db.Update(func(tx *bolt.Tx) error { - // create bucket for tracking paths - pathsBucket, err := tx.CreateBucketIfNotExists([]byte(pathsBucket)) - if err != nil { - return fmt.Errorf("failed to create paths bucket: %w", err) - } +func (b *Bucket[V]) Delete(key string) error { + return b.bucket.Delete([]byte(key)) +} - // create bucket for tracking formatters - formattersBucket, err := tx.CreateBucketIfNotExists([]byte(formattersBucket)) - if err != nil { - return fmt.Errorf("failed to create formatters bucket: %w", err) +func (b *Bucket[V]) DeleteAll() error { + c := b.bucket.Cursor() + for k, v := c.First(); !(k == nil && v == nil); k, v = c.Next() { + if err := c.Delete(); err != nil { + return fmt.Errorf("failed to remove cache entry for key %s: %w", string(k), err) } + } + return nil +} - // check for any newly configured or modified formatters - for name, formatter := range formatters { - - stat, err := os.Lstat(formatter.Executable()) - if err != nil { - return fmt.Errorf("failed to stat formatter executable %v: %w", formatter.Executable(), err) - } - - entry, err := getEntry(formattersBucket, name) - if err != nil { - return fmt.Errorf("failed to retrieve cache entry for formatter %v: %w", name, err) - } - - isNew := entry == nil - hasChanged := entry != nil && !(entry.Size == stat.Size() && entry.Modified == stat.ModTime()) - - if isNew { - logger.Debugf("formatter '%s' is new", name) - } else if hasChanged { - logger.Debug("formatter '%s' has changed", - name, - "size", stat.Size(), - "modTime", stat.ModTime(), - "cachedSize", entry.Size, - "cachedModTime", entry.Modified, - ) - } - - // update overall clean flag - clean = clean || isNew || hasChanged - - // record formatters info - entry = &Entry{ - Size: stat.Size(), - Modified: stat.ModTime(), - } - - if err = putEntry(formattersBucket, name, entry); err != nil { - return fmt.Errorf("failed to write cache entry for formatter %v: %w", name, err) - } +func (b *Bucket[V]) ForEach(f func(string, *V) error) error { + return b.bucket.ForEach(func(key, bytes []byte) error { + var value V + if err := msgpack.Unmarshal(bytes, &value); err != nil { + return fmt.Errorf("failed to unmarshal cache entry for key '%v': %w", key, err) } + return f(string(key), &value) + }) +} - // check for any removed formatters - if err = formattersBucket.ForEach(func(key []byte, _ []byte) error { - _, ok := formatters[string(key)] - if !ok { - // remove the formatter entry from the cache - if err = formattersBucket.Delete(key); err != nil { - return fmt.Errorf("failed to remove cache entry for formatter %v: %w", key, err) - } - // indicate a clean is required - clean = true - } - return nil - }); err != nil { - return fmt.Errorf("failed to check cache for removed formatters: %w", err) - } +type Tx struct { + tx *bolt.Tx +} - if clean { - // remove all path entries - c := pathsBucket.Cursor() - for k, v := c.First(); !(k == nil && v == nil); k, v = c.Next() { - if err = c.Delete(); err != nil { - return fmt.Errorf("failed to remove path entry: %w", err) - } - } - } +func (t *Tx) Commit() error { + return t.tx.Commit() +} - return nil - }) +func (t *Tx) Rollback() error { + return t.tx.Rollback() +} - return +func (t *Tx) Paths() (*Bucket[Entry], error) { + return t.cacheBucket(bucketPaths) } -// Close closes any open instance of the cache. -func Close() error { - if db == nil { - return nil - } - return db.Close() +func (t *Tx) Formatters() (*Bucket[Entry], error) { + return t.cacheBucket(bucketFormatters) } -// getEntry is a helper for reading cache entries from bolt. -func getEntry(bucket *bolt.Bucket, path string) (*Entry, error) { - b := bucket.Get([]byte(path)) - if b != nil { - var cached Entry - if err := msgpack.Unmarshal(b, &cached); err != nil { - return nil, fmt.Errorf("failed to unmarshal cache info for path '%v': %w", path, err) - } - return &cached, nil +func (t *Tx) cacheBucket(name string) (*Bucket[Entry], error) { + var b *bolt.Bucket + var err error + if t.tx.Writable() { + b, err = t.tx.CreateBucketIfNotExists([]byte(name)) } else { - return nil, nil + b = t.tx.Bucket([]byte(name)) } -} - -// putEntry is a helper for writing cache entries into bolt. -func putEntry(bucket *bolt.Bucket, path string, entry *Entry) error { - bytes, err := msgpack.Marshal(entry) if err != nil { - return fmt.Errorf("failed to marshal cache path %v: %w", path, err) + return nil, fmt.Errorf("failed to get/create bucket %s: %w", bucketPaths, err) } - - if err = bucket.Put([]byte(path), bytes); err != nil { - return fmt.Errorf("failed to put cache path %v: %w", path, err) - } - return nil + return &Bucket[Entry]{b}, nil } -// ChangeSet is used to walk a filesystem, starting at root, and outputting any new or changed paths using pathsCh. -// It determines if a path is new or has changed by comparing against cache entries. -func ChangeSet(ctx context.Context, wk walker.Walker, filesCh chan<- *walker.File) error { - start := time.Now() +type Cache struct { + db *bolt.DB + Temporary bool +} - defer func() { - logger.Debugf("finished generating change set in %v", time.Since(start)) - }() +func (c *Cache) BeginTx(writeable bool) (*Tx, error) { + tx, err := c.db.Begin(writeable) + return &Tx{tx}, err +} - var tx *bolt.Tx - var bucket *bolt.Bucket - var processed int +func (c *Cache) View(f func(*Tx) error) error { + return c.db.View(func(tx *bolt.Tx) error { + return f(&Tx{tx}) + }) +} - defer func() { - // close any pending read tx - if tx != nil { - _ = tx.Rollback() - } - }() +func (c *Cache) Update(f func(*Tx) error) error { + return c.db.Update(func(tx *bolt.Tx) error { + return f(&Tx{tx}) + }) +} - return wk.Walk(ctx, func(file *walker.File, err error) error { - select { - case <-ctx.Done(): - return ctx.Err() - default: - if err != nil { - return fmt.Errorf("failed to walk path: %w", err) - } else if file.Info.IsDir() { - // ignore directories - return nil - } - } +func (c *Cache) Close() error { + path := c.db.Path() - // open a new read tx if there isn't one in progress - // we have to periodically open a new read tx to prevent writes from being blocked - if tx == nil { - tx, err = db.Begin(false) - if err != nil { - return fmt.Errorf("failed to open a new cache read tx: %w", err) + // if this is a temporary cache instance, clean up the db file after closing + if c.Temporary { + defer func() { + if err := os.Remove(path); err != nil { + log.Errorf("failed to remove temporary cache file %s: %v", path, err) } - bucket = tx.Bucket([]byte(pathsBucket)) - } - - cached, err := getEntry(bucket, file.RelPath) - if err != nil { - return err - } - - changedOrNew := cached == nil || !(cached.Modified == file.Info.ModTime() && cached.Size == file.Info.Size()) - - stats.Add(stats.Traversed, 1) - if !changedOrNew { - // no change - return nil - } + log.Debugf("successfully removed temporary cache file %s", path) + }() + } - stats.Add(stats.Emitted, 1) + return c.db.Close() +} - // pass on the path - select { - case <-ctx.Done(): - return ctx.Err() - default: - filesCh <- file +func Open(treeRoot string, noCache bool) (*Cache, error) { + var err error + var path string + + if noCache { + // If no cache is desired, rather than complicate the logic for cache / no cache, we just create a cache instance + // backed by a temporary file instead + if f, err := os.CreateTemp("", "treefmt-no-cache-*.db"); err != nil { + return nil, fmt.Errorf("failed to create a temporary db file: %w", err) + } else { + path = f.Name() } - - // close the current tx if we have reached the batch size - processed += 1 - if processed == ReadBatchSize { - err = tx.Rollback() - tx = nil - return err + } else { + // Otherwise, the database will be located in `XDG_CACHE_DIR/treefmt/eval-cache/.db`, where is + // determined by hashing the treeRoot path. + // This associates a given treeRoot with a given instance of the cache. + h := sha1.New() + h.Write([]byte(treeRoot)) + digest := h.Sum(nil) + + name := hex.EncodeToString(digest) + if path, err = xdg.CacheFile(fmt.Sprintf("treefmt/eval-cache/%v.db", name)); err != nil { + return nil, fmt.Errorf("could not resolve local path for the cache: %w", err) } - - return nil - }) -} - -// Update is used to record updated cache information for the specified list of paths. -func Update(files []*walker.File) error { - start := time.Now() - defer func() { - logger.Debugf("finished processing %v paths in %v", len(files), time.Since(start)) - }() - - if len(files) == 0 { - return nil } - return db.Update(func(tx *bolt.Tx) error { - bucket := tx.Bucket([]byte(pathsBucket)) + // open db + db, err := bolt.Open(path, 0o600, nil) + if err != nil { + return nil, err + } - for _, f := range files { - entry := Entry{ - Size: f.Info.Size(), - Modified: f.Info.ModTime(), - } + c := &Cache{ + db: db, + Temporary: noCache, + } - if err := putEntry(bucket, f.RelPath, &entry); err != nil { - return err - } + // force creation of buckets if they don't already exist + return c, c.Update(func(tx *Tx) error { + if _, err := tx.Paths(); err != nil { + return err } - - return nil + _, err = tx.Formatters() + return err }) } diff --git a/cli/cli.go b/cli/cli.go index 655e670a..fd3a00c9 100644 --- a/cli/cli.go +++ b/cli/cli.go @@ -1,6 +1,7 @@ package cli import ( + "git.numtide.com/numtide/treefmt/cache" "os" "github.com/gobwas/glob" @@ -41,6 +42,9 @@ type Format struct { formatters map[string]*format.Formatter globalExcludes []glob.Glob + cache *cache.Cache + + walker walker.Walker fileCh chan *walker.File formattedCh chan *walker.File processedCh chan *walker.File diff --git a/cli/format.go b/cli/format.go index 34cbfdb4..c4f81ec3 100644 --- a/cli/format.go +++ b/cli/format.go @@ -13,10 +13,11 @@ import ( "syscall" "time" + "git.numtide.com/numtide/treefmt/cache" + "git.numtide.com/numtide/treefmt/format" "git.numtide.com/numtide/treefmt/stats" - "git.numtide.com/numtide/treefmt/cache" "git.numtide.com/numtide/treefmt/config" "git.numtide.com/numtide/treefmt/walker" @@ -53,13 +54,6 @@ func (f *Format) Run() (err error) { // create a prefixed logger log.SetPrefix("format") - // ensure cache is closed on return - defer func() { - if err := cache.Close(); err != nil { - log.Errorf("failed to close cache: %v", err) - } - }() - // find the config file unless specified if f.ConfigFile == "" { pwd, err := os.Getwd() @@ -119,12 +113,37 @@ func (f *Format) Run() (err error) { f.formatters[name] = formatter } - // open the cache if configured - if !f.NoCache { - if err = cache.Open(f.TreeRoot, f.ClearCache, f.formatters); err != nil { - // if we can't open the cache, we log a warning and fallback to no cache - log.Warnf("failed to open cache: %v", err) - f.NoCache = true + // initialise the cache + f.cache, err = cache.Open(f.TreeRoot, f.NoCache) + if err != nil { + return fmt.Errorf("failed to open cache: %w", err) + } + + // ensure the cache is closed on shutdown + defer func() { + if err := f.cache.Close(); err != nil { + log.Errorf("failed to close cache: %v", err) + } + log.Debug("successfully closed cache") + }() + + // check if the formatters set has changed in any meaningful way + if err = format.CheckFormatters(f.cache, f.formatters); err != nil { + return fmt.Errorf("failed to check formatters: %w", err) + } + log.Debugf("formatters check complete") + + // clear out the paths bucket if desired before starting + if f.ClearCache { + err = f.cache.Update(func(tx *cache.Tx) error { + paths, err := tx.Paths() + if err != nil { + return err + } + return paths.DeleteAll() + }) + if err != nil { + return fmt.Errorf("failed to clear paths from cache: %w", err) } } @@ -222,7 +241,8 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { } // create a filesystem walker - wk, err := walker.New(walkerType, f.TreeRoot, f.NoCache, pathCh) + var err error + f.walker, err = walker.New(walkerType, f.TreeRoot, f.cache, pathCh) if err != nil { return fmt.Errorf("failed to create walker: %w", err) } @@ -230,27 +250,17 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { // close the file channel when we're done walking the file system defer close(f.fileCh) - // if no cache has been configured, or we are processing from stdin, we invoke the walker directly - if f.NoCache || f.Stdin { - return wk.Walk(ctx, func(file *walker.File, err error) error { - select { - case <-ctx.Done(): - return ctx.Err() - default: - stats.Add(stats.Traversed, 1) - stats.Add(stats.Emitted, 1) - f.fileCh <- file - return nil - } - }) - } - - // otherwise we pass the walker to the cache and have it generate files for processing based on whether or not - // they have been added/changed since the last invocation - if err = cache.ChangeSet(ctx, wk, f.fileCh); err != nil { - return fmt.Errorf("failed to generate change set: %w", err) - } - return nil + // + return f.walker.Walk(ctx, func(file *walker.File, err error) error { + select { + case <-ctx.Done(): + return ctx.Err() + default: + stats.Add(stats.Emitted, 1) + f.fileCh <- file + return nil + } + }) } } @@ -423,17 +433,17 @@ func (f *Format) updateCache(ctx context.Context) func() error { // apply a batch processBatch := func() error { - // pass the batch to the cache for updating - if err := cache.Update(batch); err != nil { + // let the walker record updated path info + if err := f.walker.UpdatePaths(batch); err != nil { return err } + // reset the batch batch = batch[:0] return nil } // if we are processing from stdin that means we are outputting to stdout, no caching involved - // if f.NoCache is set that means either the user explicitly disabled the cache or we failed to open on - if f.Stdin || f.NoCache { + if f.Stdin { // do nothing processBatch = func() error { return nil } } diff --git a/cli/format_test.go b/cli/format_test.go index 1d6d75dd..11a77e20 100644 --- a/cli/format_test.go +++ b/cli/format_test.go @@ -412,7 +412,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { } test.WriteConfig(t, configPath, cfg) - args := []string{"--config-file", configPath, "--tree-root", tempDir} + args := []string{"--config-file", configPath, "--tree-root", tempDir, "-vv"} _, err := cmd(t, args...) as.NoError(err) assertStats(t, as, 32, 32, 3, 0) @@ -425,8 +425,8 @@ func TestBustCacheOnFormatterChange(t *testing.T) { assertStats(t, as, 32, 32, 3, 0) // check cache is working - _, err = cmd(t, args...) - as.NoError(err) + out, err := cmd(t, args...) + as.NoError(err, string(out)) assertStats(t, as, 32, 0, 0, 0) // tweak mod time of python formatter diff --git a/format/cache.go b/format/cache.go new file mode 100644 index 00000000..51278015 --- /dev/null +++ b/format/cache.go @@ -0,0 +1,93 @@ +package format + +import ( + "fmt" + "git.numtide.com/numtide/treefmt/cache" + "github.com/charmbracelet/log" + "os" +) + +func CheckFormatters(c *cache.Cache, formatters map[string]*Formatter) error { + return c.Update(func(tx *cache.Tx) error { + + clearPaths := false + + pathsBucket, err := tx.Paths() + if err != nil { + return fmt.Errorf("failed to get paths bucket from cache: %w", err) + } + + formattersBucket, err := tx.Formatters() + if err != nil { + return fmt.Errorf("failed to get formatters bucket from cache: %w", err) + } + + // check for any newly configured or modified formatters + for name, formatter := range formatters { + + stat, err := os.Lstat(formatter.Executable()) + if err != nil { + return fmt.Errorf("failed to stat formatter executable %v: %w", formatter.Executable(), err) + } + + entry, err := formattersBucket.Get(name) + if err != nil { + return fmt.Errorf("failed to retrieve cache entry for formatter %v: %w", name, err) + } + + isNew := entry == nil + hasChanged := entry != nil && !(entry.Size == stat.Size() && entry.Modified == stat.ModTime()) + + if isNew { + log.Debugf("formatter '%s' is new", name) + } else if hasChanged { + log.Debug("formatter '%s' has changed", + name, + "size", stat.Size(), + "modTime", stat.ModTime(), + "cachedSize", entry.Size, + "cachedModTime", entry.Modified, + ) + } + + // update overall flag + clearPaths = clearPaths || isNew || hasChanged + + // record formatters info + entry = &cache.Entry{ + Size: stat.Size(), + Modified: stat.ModTime(), + } + + if err = formattersBucket.Put(name, entry); err != nil { + return fmt.Errorf("failed to write cache entry for formatter %v: %w", name, err) + } + } + + // check for any removed formatters + if err = formattersBucket.ForEach(func(key string, _ *cache.Entry) error { + _, ok := formatters[key] + if !ok { + // remove the formatter entry from the cache + if err = formattersBucket.Delete(key); err != nil { + return fmt.Errorf("failed to remove cache entry for formatter %v: %w", key, err) + } + // indicate a clean is required + clearPaths = true + } + return nil + }); err != nil { + return fmt.Errorf("failed to check cache for removed formatters: %w", err) + } + + if clearPaths { + // remove all path entries + if err := pathsBucket.DeleteAll(); err != nil { + return fmt.Errorf("failed to remove all path entries from cache: %w", err) + } + } + + return nil + }) + +} diff --git a/go.sum b/go.sum index 60fc1785..1f0c0f64 100644 --- a/go.sum +++ b/go.sum @@ -46,12 +46,8 @@ github.com/go-git/go-billy/v5 v5.5.0 h1:yEY4yhzCDuMGSv83oGxiBotRzhwhNr8VZyphhiu+ github.com/go-git/go-billy/v5 v5.5.0/go.mod h1:hmexnoNsr2SJU1Ju67OaNz5ASJY3+sHgFRpCtpDCKow= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= -github.com/go-git/go-git/v5 v5.12.0 h1:7Md+ndsjrzZxbddRDZjF14qK+NN56sy6wkqaVrjZtys= -github.com/go-git/go-git/v5 v5.12.0/go.mod h1:FTM9VKtnI2m65hNI/TenDDDnUf2Q9FHnXYjuz9i5OEY= github.com/go-git/go-git/v5 v5.12.1-0.20240409060936-cd6633c3c665 h1:+M6D6MplKwRqmw8b41MArUGFsGTnl24+/S40I0yrhKs= github.com/go-git/go-git/v5 v5.12.1-0.20240409060936-cd6633c3c665/go.mod h1:QZbSbsaXQD7v0yvddAhVE2UfW5wGmeqHQ0UnOSr6JYQ= -github.com/go-git/go-git/v5 v5.12.1-0.20240516215126-9cc340a7fc5c h1:Pyuh3Y6kb/+k7yl5nLN6pcQ+7isiJ1PnBj3QSUH1hbQ= -github.com/go-git/go-git/v5 v5.12.1-0.20240516215126-9cc340a7fc5c/go.mod h1:Tzg+feu/PVazFrRFtWHxfvErT+p+Zo3Yg0nzEhJdxW4= github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= @@ -120,8 +116,6 @@ github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI= -go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE= go.etcd.io/bbolt v1.3.10 h1:+BqfJTcCzTItrop8mq/lbzL8wSGtj94UO/3U31shqG0= go.etcd.io/bbolt v1.3.10/go.mod h1:bK3UQLPJZly7IlNmV7uVHJDxfe5aK9Ll93e/74Y9oEQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -129,12 +123,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= -golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= -golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f h1:99ci1mjWVBWwJiEKYY6jWa4d2nTQVIEhZIptnrVb1XY= -golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f/go.mod h1:/lliqkxwWAhPjf5oSOIJup2XcqJaw8RGS6k3TGEc7GI= golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM= golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= @@ -146,8 +136,6 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= -golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -169,8 +157,6 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -178,9 +164,8 @@ golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuX golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= -golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -188,9 +173,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= diff --git a/test/examples/ruby/bundler.rb b/test/examples/ruby/bundler.rb index 6a3a14fc..9b177459 100644 --- a/test/examples/ruby/bundler.rb +++ b/test/examples/ruby/bundler.rb @@ -155,11 +155,11 @@ def specs_path end def cache - bundle_path.join("cache/bundler") + bundle_path.join("caching/bundler") end def user_cache - user_bundle_path.join("cache") + user_bundle_path.join("caching") end def root diff --git a/walker/filesystem.go b/walker/filesystem.go index 8a47c7af..a99d7252 100644 --- a/walker/filesystem.go +++ b/walker/filesystem.go @@ -3,17 +3,45 @@ package walker import ( "context" "fmt" + "git.numtide.com/numtide/treefmt/cache" + "git.numtide.com/numtide/treefmt/stats" "io/fs" "os" "path/filepath" + "runtime" ) type filesystemWalker struct { root string + cache *cache.Cache pathsCh chan string relPathOffset int } +func (f filesystemWalker) UpdatePaths(batch []*File) error { + if err := f.cache.Update(func(tx *cache.Tx) error { + // get the paths bucket + paths, err := tx.Paths() + if err != nil { + return err + } + for _, f := range batch { + entry := cache.Entry{ + Size: f.Info.Size(), + Modified: f.Info.ModTime(), + } + if err := paths.Put(f.RelPath, &entry); err != nil { + return err + } + } + + return nil + }); err != nil { + return fmt.Errorf("failed to update paths: %w", err) + } + return nil +} + func (f filesystemWalker) Root() string { return f.root } @@ -28,6 +56,19 @@ func (f filesystemWalker) relPath(path string) (string, error) { } func (f filesystemWalker) Walk(_ context.Context, fn WalkFunc) error { + + var tx *cache.Tx + var paths *cache.Bucket[cache.Entry] + var processed int + batchSize := 1024 * runtime.NumCPU() + + defer func() { + // close any pending read tx + if tx != nil { + _ = tx.Rollback() + } + }() + walkFn := func(path string, info fs.FileInfo, _ error) error { if info == nil { return fmt.Errorf("no such file or directory '%s'", path) @@ -48,6 +89,39 @@ func (f filesystemWalker) Walk(_ context.Context, fn WalkFunc) error { RelPath: relPath, Info: info, } + + // open a new read tx if there isn't one in progress + // we have to periodically open a new read tx to prevent writes from being blocked + if tx == nil { + if tx, err = f.cache.BeginTx(false); err != nil { + return fmt.Errorf("failed to open a new cache read tx: %w", err) + } else if paths, err = tx.Paths(); err != nil { + return fmt.Errorf("failed to get paths bucket from cache tx: %w", err) + } + } + + cached, err := paths.Get(file.RelPath) + if err != nil { + return err + } + + // close the current tx if we have reached the batch size + processed += 1 + if processed == batchSize { + if err = tx.Rollback(); err != nil { + return err + } + tx = nil + } + + // + changedOrNew := cached == nil || !(cached.Modified == file.Info.ModTime() && cached.Size == file.Info.Size()) + + stats.Add(stats.Traversed, 1) + if !changedOrNew { + // no change + return nil + } return fn(&file, err) } @@ -60,9 +134,10 @@ func (f filesystemWalker) Walk(_ context.Context, fn WalkFunc) error { return nil } -func NewFilesystem(root string, paths chan string) (Walker, error) { +func NewFilesystem(root string, cache *cache.Cache, paths chan string) (Walker, error) { return filesystemWalker{ root: root, + cache: cache, pathsCh: paths, relPathOffset: len(root) + 1, }, nil diff --git a/walker/filesystem_test.go b/walker/filesystem_test.go index 1edaa170..c94755c7 100644 --- a/walker/filesystem_test.go +++ b/walker/filesystem_test.go @@ -5,6 +5,9 @@ import ( "os" "testing" + "git.numtide.com/numtide/treefmt/cache" + "git.numtide.com/numtide/treefmt/stats" + "git.numtide.com/numtide/treefmt/test" "github.com/stretchr/testify/require" ) @@ -55,7 +58,12 @@ func TestFilesystemWalker_Walk(t *testing.T) { as := require.New(t) - walker, err := NewFilesystem(tempDir, paths) + stats.Init() + + c, err := cache.Open(t.TempDir(), false) + as.NoError(err, "failed to open cache") + + walker, err := NewFilesystem(tempDir, c, paths) as.NoError(err) idx := 0 diff --git a/walker/git.go b/walker/git.go index bbf30652..4b51c2e2 100644 --- a/walker/git.go +++ b/walker/git.go @@ -3,6 +3,8 @@ package walker import ( "context" "fmt" + "git.numtide.com/numtide/treefmt/cache" + "git.numtide.com/numtide/treefmt/stats" "io/fs" "os" "path/filepath" @@ -24,6 +26,11 @@ type gitWalker struct { relPathOffset int } +func (g gitWalker) UpdatePaths(_ []*File) error { + // nothing to do, git is doing all the tracking + return nil +} + func (g gitWalker) Root() string { return g.root } @@ -44,7 +51,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } // cache in-memory whether a path is present in the git index - var cache map[string]*index.Entry + var indexCache map[string]*index.Entry // by default, we only emit files if they have changes when compared with the git index emitFile := func(entry *index.Entry, info os.FileInfo) bool { @@ -98,6 +105,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { Info: info, } + stats.Add(stats.Traversed, 1) if err = fn(&file, err); err != nil { return err } @@ -107,10 +115,10 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } // otherwise we ensure the git index entries are cached and then check if the path is in the git index - if cache == nil { - cache = make(map[string]*index.Entry) + if indexCache == nil { + indexCache = make(map[string]*index.Entry) for _, entry := range idx.Entries { - cache[entry.Name] = entry + indexCache[entry.Name] = entry } } @@ -119,7 +127,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to find relative path for %v: %w", path, err) } - _, ok := cache[relPath] + _, ok := indexCache[relPath] if !(path == g.root || ok) { log.Debugf("path %v not found in git index, skipping", path) continue @@ -136,7 +144,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to determine a relative path for %s: %w", path, err) } - if entry, ok := cache[relPath]; !ok { + if entry, ok := indexCache[relPath]; !ok { log.Debugf("path %v not found in git index, skipping", path) return nil } else if !emitFile(entry, info) { @@ -150,6 +158,7 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { Info: info, } + stats.Add(stats.Traversed, 1) return fn(&file, err) }) } @@ -159,18 +168,24 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { func NewGit( root string, - noCache bool, + cache *cache.Cache, paths chan string, ) (Walker, error) { repo, err := git.PlainOpen(root) if err != nil { return nil, fmt.Errorf("failed to open git repo: %w", err) } + + // immediately close the cache as we have no need for it + if err = cache.Close(); err != nil { + return nil, fmt.Errorf("failed to close cache: %w", err) + } + return &gitWalker{ root: root, paths: paths, repo: repo, - noCache: noCache, + noCache: cache.Temporary, relPathOffset: len(root) + 1, }, nil } diff --git a/walker/walker.go b/walker/walker.go index 270854cf..84d4b361 100644 --- a/walker/walker.go +++ b/walker/walker.go @@ -3,6 +3,7 @@ package walker import ( "context" "fmt" + "git.numtide.com/numtide/treefmt/cache" "io/fs" "os" "time" @@ -54,26 +55,33 @@ type WalkFunc func(file *File, err error) error type Walker interface { Root() string Walk(ctx context.Context, fn WalkFunc) error + UpdatePaths(batch []*File) error } -func New(walkerType Type, root string, noCache bool, pathsCh chan string) (Walker, error) { +func New( + walkerType Type, + root string, + cache *cache.Cache, + pathsCh chan string, +) (Walker, error) { + switch walkerType { case Git: - return NewGit(root, noCache, pathsCh) + return NewGit(root, cache, pathsCh) case Auto: - return Detect(root, noCache, pathsCh) + return Detect(root, cache, pathsCh) case Filesystem: - return NewFilesystem(root, pathsCh) + return NewFilesystem(root, cache, pathsCh) default: return nil, fmt.Errorf("unknown walker type: %v", walkerType) } } -func Detect(root string, noCache bool, pathsCh chan string) (Walker, error) { +func Detect(root string, cache *cache.Cache, pathsCh chan string) (Walker, error) { // for now, we keep it simple and try git first, filesystem second - w, err := NewGit(root, noCache, pathsCh) + w, err := NewGit(root, cache, pathsCh) if err == nil { return w, err } - return NewFilesystem(root, pathsCh) + return NewFilesystem(root, cache, pathsCh) } From 352b6e8094e93ce86a57d83908fcbc7a89197590 Mon Sep 17 00:00:00 2001 From: Brian McGee Date: Tue, 23 Jul 2024 14:39:49 +0100 Subject: [PATCH 3/3] wip: add cache tests Signed-off-by: Brian McGee --- cache/cache.go | 4 ++ cache/cache_test.go | 166 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 cache/cache_test.go diff --git a/cache/cache.go b/cache/cache.go index a469caa5..b6703cba 100644 --- a/cache/cache.go +++ b/cache/cache.go @@ -27,6 +27,10 @@ type Bucket[V any] struct { bucket *bolt.Bucket } +func (b *Bucket[V]) Size() int { + return b.bucket.Stats().KeyN +} + func (b *Bucket[V]) Get(key string) (*V, error) { bytes := b.bucket.Get([]byte(key)) if bytes == nil { diff --git a/cache/cache_test.go b/cache/cache_test.go new file mode 100644 index 00000000..92cd5f30 --- /dev/null +++ b/cache/cache_test.go @@ -0,0 +1,166 @@ +package cache + +import ( + "os" + "strings" + "testing" + "time" + + "github.com/adrg/xdg" + "github.com/stretchr/testify/require" +) + +func TestCache_Open(t *testing.T) { + as := require.New(t) + + tempDir := t.TempDir() + xdgPrefix, err := xdg.CacheFile("") + as.NoError(err) + + // normal open + cache, err := Open(tempDir, false) + path := cache.db.Path() + + as.NoError(err) + as.True( + strings.HasPrefix(path, xdgPrefix), + "db path %s does not contain the xdg cache file prefix %s", + path, xdgPrefix, + ) + + // normal close + as.NoError(cache.Close()) + _, err = os.Stat(path) + as.NoError(err, "db path %s should still exist after closing the cache", path) + + // open a temp cache e.g. --no-cache + tempDir = t.TempDir() + cache, err = Open(tempDir, true) + as.NoError(err) + + // close temp cache + as.NoError(cache.Close()) + _, err = os.Stat(cache.db.Path()) + as.ErrorIs(err, os.ErrNotExist, "temp db path %s should not exist after closing the cache") +} + +func TestCache_Update(t *testing.T) { + as := require.New(t) + + cache, err := Open(t.TempDir(), false) + as.NoError(err) + + now := time.Now() + + testData := map[string]map[string]*Entry{ + "paths": { + "foo": {Size: 0, Modified: now}, + "bar": {Size: 1, Modified: now.Add(-1 * time.Second)}, + "fizz/buzz": {Size: 1 << 16, Modified: now.Add(-1 * time.Minute)}, + }, + "formatters": { + "bla": {Size: 1 << 32, Modified: now.Add(-1 * time.Hour)}, + "foo/bar/baz": {Size: 1 << 24, Modified: now.Add(-24 * time.Hour)}, + }, + } + + putEntries := func(bucket *Bucket[Entry], err error) func(string) { + return func(name string) { + as.NoError(err) + for k, v := range testData[name] { + as.NoError(bucket.Put(k, v), "failed to put value") + } + } + } + + getEntries := func(bucket *Bucket[Entry], err error) func(string) { + return func(name string) { + as.NoError(err) + as.Equal(len(testData[name]), bucket.Size()) + for k, v := range testData[name] { + actual, err := bucket.Get(k) + as.NoError(err) + as.EqualExportedValues(*v, *actual) + } + } + } + + modifyEntries := func(bucket *Bucket[Entry], err error) func(string) { + return func(name string) { + entries := testData[name] + idx := 0 + for k := range entries { + switch idx { + case 0: + // delete the first entry + as.NoError(bucket.Delete(k)) + delete(entries, k) + case 1: + // change the second + entries[k] = &Entry{Size: 123, Modified: now.Add(-2 * time.Hour)} + as.NoError(bucket.Put(k, entries[k])) + default: + break + } + } + } + } + + clearEntries := func(bucket *Bucket[Entry], err error) { + as.NoError(err) + as.NoError(bucket.DeleteAll()) + } + + checkEmpty := func(bucket *Bucket[Entry], err error) { + as.NoError(err) + as.Equal(0, bucket.Size()) + } + + // insert the test data into the cache + err = cache.Update(func(tx *Tx) error { + putEntries(tx.Paths())("paths") + putEntries(tx.Formatters())("formatters") + return nil + }) + as.NoError(err) + + // read it back and check it matches + err = cache.View(func(tx *Tx) error { + getEntries(tx.Paths())("paths") + getEntries(tx.Formatters())("formatters") + return nil + }) + as.NoError(err) + + // delete and update some entries + err = cache.Update(func(tx *Tx) error { + modifyEntries(tx.Paths()) + modifyEntries(tx.Formatters()) + return nil + }) + as.NoError(err) + + // read them back and make sure they match the updated test data + err = cache.View(func(tx *Tx) error { + getEntries(tx.Paths())("paths") + getEntries(tx.Formatters())("formatters") + return nil + }) + as.NoError(err) + + // delete all + err = cache.Update(func(tx *Tx) error { + clearEntries(tx.Paths()) + clearEntries(tx.Formatters()) + return nil + }) + as.NoError(err) + + // check the cache is empty + err = cache.Update(func(tx *Tx) error { + checkEmpty(tx.Paths()) + checkEmpty(tx.Formatters()) + return nil + }) + as.NoError(err) +}