Skip to content

Commit

Permalink
feat: improve change detection
Browse files Browse the repository at this point in the history
Previously, we were storing the last `modtime` and `size` for a given path in `boltdb`.
This was used to determine if a file had changed and whether we should format it.

In addition, if a formatter's executable changed (`modtime` or `size`), we would delete
all path entries in the database before processing, thereby forcing each file to be
formatted.

This commit introduces a new approach to change detection, one which takes into account
changes to the underlying file, the formatter's executable, _and_ the formatter's
configuration.

Now, when deciding if we should format a file, we do the following:

- Hash each matching formatter, in sequence, using its `name`, `options`, `priority`
  as well as the `modtime` and `size` of its executable.
  This is pre-computed on a per-pipeline basis.
- We then add the file's `modtime` and `size` to generate a format signature.
  You can think of this signature as a unique representation of what we are about to do
  with the file.
- The format signature is then compared with a cache entry (if available).
- If the signatures match, we have already applied this sequence of formatters, with these
  particular options etc. to this file when it had this `modtime` and `size`, so there is
  no more processing to be done.
- If the signatures do no match, we should format the file, recording the new format
  signature in the cache when we are finished.

This approach is simpler in terms of storage, and has the added benefit of finer grained
change detection versus the brute force cache busting we were doing before.

In terms of performance impact, with the pre-computing of hashes per-pipeline and the
simpler storage schema, there appears to have been no significant impact.
Manual testing with [nixpkgs](https://github.com/nixos/nixpkgs) shows comparable run times
for both hot and cold caches.

> [!NOTE]
> Since this changes the database schema, rather than implementing some form of migration
> logic to remove the old buckets and so on, I decided to upgrade the hash algorithm we
> use when determining the filename for the db file.
>
> Previously, we were using `sha1`, matching the behaviour from `1.0`.
> Now we use `sha256`, which results in a slightly longer db name, but has the benefit of
> ensuring a new db instance will be created on first invocation, as well as making
> [gosec](https://golangci-lint.run/usage/linters/#gosec) happy.

Closes #455

Signed-off-by: Brian McGee <[email protected]>
  • Loading branch information
brianmcgee committed Oct 18, 2024
1 parent ce42c30 commit 185e6e3
Show file tree
Hide file tree
Showing 19 changed files with 766 additions and 796 deletions.
10 changes: 0 additions & 10 deletions cmd/format/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,6 @@ func Run(v *viper.Viper, statz *stats.Stats, cmd *cobra.Command, paths []string)
}()
}

// set a prefix on the default logger
log.SetPrefix("format")

var db *bolt.DB

// open the db unless --no-cache was specified
Expand Down Expand Up @@ -161,13 +158,6 @@ func Run(v *viper.Viper, statz *stats.Stats, cmd *cobra.Command, paths []string)
return fmt.Errorf("failed to create composite formatter: %w", err)
}

if db != nil {
// compare formatters with the db, busting the cache if the formatters have changed
if err := formatter.BustCache(db); err != nil {
return fmt.Errorf("failed to compare formatters: %w", err)
}
}

// create a new walker for traversing the paths
walker, err := walk.NewCompositeReader(walkType, cfg.TreeRoot, paths, db, statz)
if err != nil {
Expand Down
53 changes: 42 additions & 11 deletions cmd/root_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func TestOnUnmatched(t *testing.T) {

checkOutput := func(level string, output []byte) {
for _, p := range paths {
as.Contains(string(output), fmt.Sprintf("%s format: no formatter for path: %s", level, p))
as.Contains(string(output), fmt.Sprintf("%s no formatter for path: %s", level, p))
}
}

Expand Down Expand Up @@ -605,15 +605,15 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
configPath := tempDir + "/touch.toml"

// symlink some formatters into temp dir, so we can mess with their mod times
binPath := tempDir + "/bin"
binPath := filepath.Join(tempDir, "bin")
as.NoError(os.Mkdir(binPath, 0o755))

binaries := []string{"black", "elm-format", "gofmt"}

for _, name := range binaries {
src, err := exec.LookPath(name)
as.NoError(err)
as.NoError(os.Symlink(src, binPath+"/"+name))
as.NoError(os.Symlink(src, filepath.Join(binPath, name)))
}

// prepend our test bin directory to PATH
Expand Down Expand Up @@ -647,15 +647,16 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
})

// tweak mod time of elm formatter
as.NoError(test.RecreateSymlink(t, binPath+"/"+"elm-format"))
newTime := time.Now().Add(-time.Minute)
as.NoError(test.Lutimes(t, filepath.Join(binPath, "elm-format"), newTime, newTime))

_, statz, err = treefmt(t, args...)
as.NoError(err)

assertStats(t, as, statz, map[stats.Type]int{
stats.Traversed: 32,
stats.Matched: 3,
stats.Formatted: 3,
stats.Formatted: 1,
stats.Changed: 0,
})

Expand All @@ -671,15 +672,15 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
})

// tweak mod time of python formatter
as.NoError(test.RecreateSymlink(t, binPath+"/"+"black"))
as.NoError(test.Lutimes(t, filepath.Join(binPath, "black"), newTime, newTime))

_, statz, err = treefmt(t, args...)
as.NoError(err)

assertStats(t, as, statz, map[stats.Type]int{
stats.Traversed: 32,
stats.Matched: 3,
stats.Formatted: 3,
stats.Formatted: 2,
stats.Changed: 0,
})

Expand All @@ -695,11 +696,12 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
})

// add go formatter
cfg.FormatterConfigs["go"] = &config.Formatter{
goFormatter := &config.Formatter{
Command: "gofmt",
Options: []string{"-w"},
Includes: []string{"*.go"},
}
cfg.FormatterConfigs["go"] = goFormatter
test.WriteConfig(t, configPath, cfg)

_, statz, err = treefmt(t, args...)
Expand All @@ -708,7 +710,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
assertStats(t, as, statz, map[stats.Type]int{
stats.Traversed: 32,
stats.Matched: 4,
stats.Formatted: 4,
stats.Formatted: 1,
stats.Changed: 0,
})

Expand All @@ -723,6 +725,35 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
stats.Changed: 0,
})

// tweak go formatter options
goFormatter.Options = []string{"-w", "-s"}

test.WriteConfig(t, configPath, cfg)

_, statz, err = treefmt(t, args...)
as.NoError(err)

assertStats(t, as, statz, map[stats.Type]int{
stats.Traversed: 32,
stats.Matched: 4,
stats.Formatted: 1,
stats.Changed: 0,
})

// add a priority
cfg.FormatterConfigs["go"].Priority = 3
test.WriteConfig(t, configPath, cfg)

_, statz, err = treefmt(t, args...)
as.NoError(err)

assertStats(t, as, statz, map[stats.Type]int{
stats.Traversed: 32,
stats.Matched: 4,
stats.Formatted: 1,
stats.Changed: 0,
})

// remove python formatter
delete(cfg.FormatterConfigs, "python")
test.WriteConfig(t, configPath, cfg)
Expand All @@ -733,7 +764,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
assertStats(t, as, statz, map[stats.Type]int{
stats.Traversed: 32,
stats.Matched: 2,
stats.Formatted: 2,
stats.Formatted: 0,
stats.Changed: 0,
})

Expand All @@ -758,7 +789,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) {
assertStats(t, as, statz, map[stats.Type]int{
stats.Traversed: 32,
stats.Matched: 1,
stats.Formatted: 1,
stats.Formatted: 0,
stats.Changed: 0,
})

Expand Down
193 changes: 193 additions & 0 deletions format/composite.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
package format

import (
"context"
"crypto/sha256"
"errors"
"fmt"
"os"
"slices"

"github.com/charmbracelet/log"
"github.com/gobwas/glob"
"github.com/numtide/treefmt/config"
"github.com/numtide/treefmt/stats"
"github.com/numtide/treefmt/walk"
"mvdan.cc/sh/v3/expand"
)

const (
batchKeySeparator = ":"
)

var ErrFormattingFailures = errors.New("formatting failures detected")

// CompositeFormatter handles the application of multiple Formatter instances based on global excludes and individual
// formatter configuration.
type CompositeFormatter struct {
cfg *config.Config
stats *stats.Stats
globalExcludes []glob.Glob

unmatchedLevel log.Level

scheduler *scheduler
formatters map[string]*Formatter
}

// match filters the file against global excludes and returns a list of formatters that want to process the file.
func (c *CompositeFormatter) match(file *walk.File) []*Formatter {
// first check if this file has been globally excluded
if pathMatches(file.RelPath, c.globalExcludes) {
log.Debugf("path matched global excludes: %s", file.RelPath)

return nil
}

// a list of formatters that match this file
var matches []*Formatter

// iterate the formatters, recording which are interested in this file
for _, formatter := range c.formatters {
if formatter.Wants(file) {
matches = append(matches, formatter)
}
}

return matches
}

// Apply applies the configured formatters to the given files.
func (c *CompositeFormatter) Apply(ctx context.Context, files []*walk.File) error {
var toRelease []*walk.File

for _, file := range files {
matches := c.match(file) // match the file against the formatters

// check if there were no matches
if len(matches) == 0 {
// log that there was no match, exiting with an error if the unmatched level was set to fatal
if c.unmatchedLevel == log.FatalLevel {
return fmt.Errorf("no formatter for path: %s", file.RelPath)
}

log.Logf(c.unmatchedLevel, "no formatter for path: %s", file.RelPath)

// no further processing to be done, append to the release list
toRelease = append(toRelease, file)

// continue to the next file
continue
}

// record there was a match
c.stats.Add(stats.Matched, 1)

if accepted, err := c.scheduler.submit(ctx, file, matches); err != nil {
return fmt.Errorf("failed to schedule file: %w", err)
} else if !accepted {
// if a file wasn't accepted, it means there was no formatting to perform
toRelease = append(toRelease, file)
}
}

// release files that require no further processing
// we set noCache to true as there's no need to update the cache, since we skipped those files
releaseCtx := walk.SetNoCache(ctx, true)

for _, file := range toRelease {
if err := file.Release(releaseCtx); err != nil {
return fmt.Errorf("failed to release file: %w", err)
}
}

return nil
}

// signature takes anything that might affect the paths to be traversed,
// or how they are traversed, and adds it to a sha256 hash.
// This can be used to determine if there has been a material change in config.
func (c *CompositeFormatter) signature() (signature, error) {
h := sha256.New()

// sort formatters deterministically
formatters := make([]*Formatter, 0, len(c.formatters))
for _, f := range c.formatters {
formatters = append(formatters, f)
}

slices.SortFunc(formatters, formatterSortFunc)

// apply them to the hash
for _, f := range formatters {
if err := f.Hash(h); err != nil {
return nil, fmt.Errorf("failed to hash formatter: %w", err)
}
}

// finalize
return h.Sum(nil), nil
}

// Close finalizes the processing of the CompositeFormatter, ensuring that any remaining batches are applied and
// all formatters have completed their tasks. It returns an error if any formatting failures were detected.
func (c *CompositeFormatter) Close(ctx context.Context) error {
return c.scheduler.close(ctx)
}

func NewCompositeFormatter(
cfg *config.Config,
statz *stats.Stats,
batchSize int,
) (*CompositeFormatter, error) {
// compile global exclude globs
globalExcludes, err := compileGlobs(cfg.Excludes)
if err != nil {
return nil, fmt.Errorf("failed to compile global excludes: %w", err)
}

// parse unmatched log level
unmatchedLevel, err := log.ParseLevel(cfg.OnUnmatched)
if err != nil {
return nil, fmt.Errorf("invalid on-unmatched value: %w", err)
}

// create a composite formatter, adjusting the change logging based on --fail-on-change
changeLevel := log.DebugLevel
if cfg.FailOnChange {
changeLevel = log.ErrorLevel
}

// create formatters
formatters := make(map[string]*Formatter)

env := expand.ListEnviron(os.Environ()...)

for name, formatterCfg := range cfg.FormatterConfigs {
formatter, err := newFormatter(name, cfg.TreeRoot, env, formatterCfg)

if errors.Is(err, ErrCommandNotFound) && cfg.AllowMissingFormatter {
log.Debugf("formatter command not found: %v", name)

continue
} else if err != nil {
return nil, fmt.Errorf("failed to initialise formatter %v: %w", name, err)
}

// store formatter by name
formatters[name] = formatter
}

// create a scheduler for carrying out the actual formatting
scheduler := newScheduler(statz, batchSize, changeLevel, formatters)

return &CompositeFormatter{
cfg: cfg,
stats: statz,
globalExcludes: globalExcludes,
unmatchedLevel: unmatchedLevel,

scheduler: scheduler,
formatters: formatters,
}, nil
}
Loading

0 comments on commit 185e6e3

Please sign in to comment.