Merge cockroachdb#123331

123331: internal/sqlsmith: make it more deterministic r=yuzefovich a=yuzefovich This commit removes one source of non-determinism (iteration order over a map) in sqlsmith. The following parts are affected: - list of regions for MR setup - file names for IMPORT stmt - table-index pairs for merge join expr - DROP FUNCTION - picking a random table or random index - UDTs - builtin functions and binary operators (here we needed to make construction of global "registries" deterministic). Informs: cockroachdb#122766 Epic: None Release note: None Co-authored-by: Yahor Yuzefovich <[email protected]>
lyang24 · May 1, 2024 · 186b9b9 · 186b9b9
2 parents e572349 + d30a455
commit 186b9b9
Show file tree

Hide file tree

Showing 6 changed files with 130 additions and 46 deletions.
diff --git a/pkg/internal/sqlsmith/alter.go b/pkg/internal/sqlsmith/alter.go
@@ -12,6 +12,7 @@ package sqlsmith
 
 import (
 	gosql "database/sql"
+	"sort"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
 	"github.com/cockroachdb/cockroach/pkg/sql/randgen"
@@ -425,7 +426,7 @@ func makeDropType(s *Smither) (tree.Statement, bool) {
 	var typNames []*tree.UnresolvedObjectName
 	for len(typNames) < 1 || s.coin() {
 		// It's ok if the same type is chosen multiple times.
-		typName, ok := s.getRandUserDefinedType()
+		_, typName, ok := s.getRandUserDefinedType()
 		if !ok {
 			if len(typNames) == 0 {
 				return nil, false
@@ -458,6 +459,9 @@ func rowsToRegionList(rows *gosql.Rows) ([]string, error) {
 	for region := range regionsSet {
 		regions = append(regions, region)
 	}
+	// Make deterministic. Note that we don't need to shuffle the regions since
+	// the caller will be picking random ones from the slice.
+	sort.Strings(regions)
 	return regions, nil
 }
 

diff --git a/pkg/internal/sqlsmith/bulkio.go b/pkg/internal/sqlsmith/bulkio.go
@@ -16,6 +16,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"regexp"
+	"sort"
 	"strings"
 	"time"
 
@@ -125,6 +126,7 @@ func makeRestore(s *Smither) (tree.Statement, bool) {
 	func() {
 		s.lock.Lock()
 		defer s.lock.Unlock()
+		// TODO(yuzefovich): picking a backup target here is non-deterministic.
 		for name, targets = range s.bulkBackups {
 			break
 		}
@@ -219,12 +221,17 @@ func makeImport(s *Smither) (tree.Statement, bool) {
 		}
 		expr := s.bulkExports[0]
 		s.bulkExports = s.bulkExports[1:]
-		var f tree.Exprs
+		var fileNames []string
 		for name := range s.bulkFiles {
 			if strings.Contains(name, expr+"/") && !strings.HasSuffix(name, exportSchema) {
-				f = append(f, tree.NewStrVal(s.bulkSrv.URL+name))
+				fileNames = append(fileNames, name)
 			}
 		}
+		sort.Strings(fileNames)
+		var f tree.Exprs
+		for _, name := range fileNames {
+			f = append(f, tree.NewStrVal(s.bulkSrv.URL+name))
+		}
 		return f, expr
 	}()
 	// An empty table will produce an EXPORT with zero files.

diff --git a/pkg/internal/sqlsmith/relational.go b/pkg/internal/sqlsmith/relational.go
@@ -12,6 +12,7 @@ package sqlsmith
 
 import (
 	"fmt"
+	"sort"
 	"strings"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/randgen"
@@ -356,16 +357,31 @@ func makeMergeJoinExpr(s *Smither, _ colRefs, forJoin bool) (tree.TableExpr, col
 	rightAliasName := tree.NewUnqualifiedTableName(rightAlias)
 
 	// Now look for one that satisfies our constraints (some shared prefix
-	// of type + direction), might end up being the same one. We rely on
-	// Go's non-deterministic map iteration ordering for randomness.
+	// of type + direction), might end up being the same one.
 	rightTableName, cols, ok := func() (*tree.TableIndexName, [][2]colRef, bool) {
 		s.lock.RLock()
 		defer s.lock.RUnlock()
-		for tbl, idxs := range s.indexes {
-			for idxName, idx := range idxs {
+		if len(s.tables) == 0 {
+			return nil, nil, false
+		}
+		// Iterate in deterministic but random order over all tables.
+		tableNames := make([]tree.TableName, 0, len(s.tables))
+		for t := range s.indexes {
+			tableNames = append(tableNames, t)
+		}
+		sort.Slice(tableNames, func(i, j int) bool {
+			return strings.Compare(tableNames[i].String(), tableNames[j].String()) < 0
+		})
+		s.rnd.Shuffle(len(tableNames), func(i, j int) {
+			tableNames[i], tableNames[j] = tableNames[j], tableNames[i]
+		})
+		for _, tbl := range tableNames {
+			// Iterate in deterministic but random order over all indexes.
+			idxs := s.getAllIndexesForTableRLocked(tbl)
+			for _, idx := range idxs {
 				rightTableName := &tree.TableIndexName{
 					Table: tbl,
-					Index: tree.UnrestrictedName(idxName),
+					Index: tree.UnrestrictedName(idx.Name),
 				}
 				// cols keeps track of matching column pairs.
 				var cols [][2]colRef
@@ -919,6 +935,10 @@ func makeDropFunc(s *Smither) (tree.Statement, bool) {
 	for oid := range fns {
 		retOIDs = append(retOIDs, oid)
 	}
+	// Sort the slice since iterating over fns is non-deterministic.
+	sort.Slice(retOIDs, func(i, j int) bool {
+		return retOIDs[i] < retOIDs[j]
+	})
 	// Pick a random starting point within the list of OIDs.
 	oidShift := s.rnd.Intn(len(retOIDs))
 	for i := 0; i < len(retOIDs); i++ {

diff --git a/pkg/internal/sqlsmith/schema.go b/pkg/internal/sqlsmith/schema.go
@@ -12,6 +12,7 @@ package sqlsmith
 
 import (
 	"fmt"
+	"sort"
 	"strings"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
@@ -87,16 +88,56 @@ func (s *Smither) ReloadSchemas() error {
 	return err
 }
 
+// indexesWithNames is a helper struct to sort CreateIndex nodes based on the
+// names.
+type indexesWithNames struct {
+	names []string
+	nodes []*tree.CreateIndex
+}
+
+func (t *indexesWithNames) Len() int {
+	return len(t.names)
+}
+
+func (t *indexesWithNames) Less(i, j int) bool {
+	return t.names[i] < t.names[j]
+}
+
+func (t *indexesWithNames) Swap(i, j int) {
+	t.names[i], t.names[j] = t.names[j], t.names[i]
+	t.nodes[i], t.nodes[j] = t.nodes[j], t.nodes[i]
+}
+
+var _ sort.Interface = &indexesWithNames{}
+
+// getAllIndexesForTableRLocked returns information about all indexes of the
+// given table in the deterministic order. s.lock is assumed to be read-locked.
+func (s *Smither) getAllIndexesForTableRLocked(tableName tree.TableName) []*tree.CreateIndex {
+	s.lock.AssertRHeld()
+	indexes, ok := s.indexes[tableName]
+	if !ok {
+		return nil
+	}
+	names := make([]string, 0, len(indexes))
+	nodes := make([]*tree.CreateIndex, 0, len(indexes))
+	for _, index := range indexes {
+		names = append(names, string(index.Name))
+		nodes = append(nodes, index)
+	}
+	sort.Sort(&indexesWithNames{names: names, nodes: nodes})
+	return nodes
+}
+
 func (s *Smither) getRandTable() (*aliasedTableRef, bool) {
 	s.lock.RLock()
 	defer s.lock.RUnlock()
 	if len(s.tables) == 0 {
 		return nil, false
 	}
 	table := s.tables[s.rnd.Intn(len(s.tables))]
-	indexes := s.indexes[*table.TableName]
 	var indexFlags tree.IndexFlags
 	if !s.disableIndexHints && s.coin() {
+		indexes := s.getAllIndexesForTableRLocked(*table.TableName)
 		indexNames := make([]tree.Name, 0, len(indexes))
 		for _, index := range indexes {
 			if !index.Inverted {
@@ -117,17 +158,16 @@ func (s *Smither) getRandTable() (*aliasedTableRef, bool) {
 func (s *Smither) getRandTableIndex(
 	table, alias tree.TableName,
 ) (*tree.TableIndexName, *tree.CreateIndex, colRefs, bool) {
-	s.lock.RLock()
-	indexes := s.indexes[table]
-	s.lock.RUnlock()
+	var indexes []*tree.CreateIndex
+	func() {
+		s.lock.RLock()
+		defer s.lock.RUnlock()
+		indexes = s.getAllIndexesForTableRLocked(table)
+	}()
 	if len(indexes) == 0 {
 		return nil, nil, nil, false
 	}
-	names := make([]tree.Name, 0, len(indexes))
-	for n := range indexes {
-		names = append(names, n)
-	}
-	idx := indexes[names[s.rnd.Intn(len(names))]]
+	idx := indexes[s.rnd.Intn(len(indexes))]
 	var refs colRefs
 	s.lock.RLock()
 	defer s.lock.RUnlock()
@@ -159,13 +199,12 @@ func (s *Smither) getRandIndex() (*tree.TableIndexName, *tree.CreateIndex, colRe
 }
 
 func (s *Smither) getRandUserDefinedTypeLabel() (*tree.EnumValue, *tree.TypeName, bool) {
-	typName, ok := s.getRandUserDefinedType()
+	udt, typName, ok := s.getRandUserDefinedType()
 	if !ok {
 		return nil, nil, false
 	}
 	s.lock.RLock()
 	defer s.lock.RUnlock()
-	udt := s.types.udts[*typName]
 	logicalRepresentations := udt.TypeMeta.EnumData.LogicalRepresentations
 	// There are no values in this enum.
 	if len(logicalRepresentations) == 0 {
@@ -175,21 +214,14 @@ func (s *Smither) getRandUserDefinedTypeLabel() (*tree.EnumValue, *tree.TypeName
 	return &enumVal, typName, true
 }
 
-func (s *Smither) getRandUserDefinedType() (*tree.TypeName, bool) {
+func (s *Smither) getRandUserDefinedType() (*types.T, *tree.TypeName, bool) {
 	s.lock.RLock()
 	defer s.lock.RUnlock()
 	if s.types == nil || len(s.types.udts) == 0 {
-		return nil, false
+		return nil, nil, false
 	}
 	idx := s.rnd.Intn(len(s.types.udts))
-	count := 0
-	for typName := range s.types.udts {
-		if count == idx {
-			return &typName, true
-		}
-		count++
-	}
-	return nil, false
+	return s.types.udts[idx], &s.types.udtNames[idx], true
 }
 
 func (s *Smither) extractTypes() (*typeInfo, error) {
@@ -205,7 +237,8 @@ FROM
 	defer rows.Close()
 
 	evalCtx := eval.Context{}
-	udtMapping := make(map[tree.TypeName]*types.T)
+	var udts []*types.T
+	var udtNames []tree.TypeName
 
 	for rows.Next() {
 		// For each row, collect columns.
@@ -243,18 +276,15 @@ FROM
 				IsMemberReadOnly:        make([]bool, len(members)),
 			},
 		}
-		key := tree.MakeSchemaQualifiedTypeName(scName, name)
-		udtMapping[key] = typ
-	}
-	var udts []*types.T
-	for _, t := range udtMapping {
-		udts = append(udts, t)
+		udts = append(udts, typ)
+		udtNames = append(udtNames, tree.MakeSchemaQualifiedTypeName(scName, name))
 	}
 	// Make sure that future appends to udts force a copy.
 	udts = udts[:len(udts):len(udts)]
 
 	return &typeInfo{
-		udts:        udtMapping,
+		udts:        udts,
+		udtNames:    udtNames,
 		scalarTypes: append(udts, types.Scalar...),
 		seedTypes:   append(udts, randgen.SeedTypes...),
 	}, nil
@@ -454,6 +484,9 @@ func (s *Smither) extractIndexes(
 		// Remove indexes with empty Columns. This is the case for rowid indexes
 		// where the only index column, rowid, is ignored in the SQL statement
 		// above, but the stored columns are not.
+		//
+		// Note that here non-deterministic iteration order over 'indexes' map
+		// doesn't matter.
 		for name, idx := range indexes {
 			if len(idx.Columns) == 0 {
 				delete(indexes, name)
@@ -492,12 +525,21 @@ type operator struct {
 }
 
 var operators = func() map[oid.Oid][]operator {
+	// Ensure deterministic order of populating operators map.
+	binOps := make([]treebin.BinaryOperatorSymbol, 0, len(tree.BinOps))
+	for op := range tree.BinOps {
+		binOps = append(binOps, op)
+	}
+	sort.Slice(binOps, func(i, j int) bool {
+		return binOps[i] < binOps[j]
+	})
 	m := map[oid.Oid][]operator{}
-	for BinaryOperator, overload := range tree.BinOps {
+	for _, binOp := range binOps {
+		overload := tree.BinOps[binOp]
 		_ = overload.ForEachBinOp(func(bo *tree.BinOp) error {
 			m[bo.ReturnType.Oid()] = append(m[bo.ReturnType.Oid()], operator{
 				BinOp:    bo,
-				Operator: treebin.MakeBinaryOperator(BinaryOperator),
+				Operator: treebin.MakeBinaryOperator(binOp),
 			})
 			return nil
 		})
@@ -518,8 +560,15 @@ type functionsMu struct {
 }
 
 var functions = func() *functionsMu {
+	// Ensure deterministic order of populating functions map.
+	funcNames := make([]string, 0, len(tree.FunDefs))
+	for name := range tree.FunDefs {
+		funcNames = append(funcNames, name)
+	}
+	sort.Strings(funcNames)
 	m := map[tree.FunctionClass]map[oid.Oid][]function{}
-	for _, def := range tree.FunDefs {
+	for _, name := range funcNames {
+		def := tree.FunDefs[name]
 		if n := tree.Name(def.Name); n.String() != def.Name {
 			// sqlsmith doesn't know how to quote function names, e.g. for
 			// the numeric cast, we need to use `"numeric"(val)`, but sqlsmith

diff --git a/pkg/internal/sqlsmith/sqlsmith.go b/pkg/internal/sqlsmith/sqlsmith.go
@@ -70,7 +70,9 @@ type Smither struct {
 	tables    []*tableRef
 	sequences []*sequenceRef
 	columns   map[tree.TableName]map[tree.Name]*tree.ColumnTableDef
-	indexes   map[tree.TableName]map[tree.Name]*tree.CreateIndex
+	// Note: consider using getAllIndexesForTable helper if you need to iterate
+	// over all indexes for a particular table.
+	indexes map[tree.TableName]map[tree.Name]*tree.CreateIndex
 	// Only one of nameCounts and nameGens will be used. nameCounts is used when
 	// simpleNames is true.
 	nameCounts       map[string]int

diff --git a/pkg/internal/sqlsmith/type.go b/pkg/internal/sqlsmith/type.go
@@ -127,7 +127,8 @@ func (s *Smither) makeDesiredTypes() []*types.T {
 }
 
 type typeInfo struct {
-	udts        map[tree.TypeName]*types.T
+	udts        []*types.T
+	udtNames    []tree.TypeName
 	seedTypes   []*types.T
 	scalarTypes []*types.T
 }
@@ -137,11 +138,12 @@ func (s *Smither) ResolveType(
 	_ context.Context, name *tree.UnresolvedObjectName,
 ) (*types.T, error) {
 	key := tree.MakeSchemaQualifiedTypeName(name.Schema(), name.Object())
-	res, ok := s.types.udts[key]
-	if !ok {
-		return nil, errors.Newf("type name %s not found by smither", name.Object())
+	for i, typeName := range s.types.udtNames {
+		if typeName == key {
+			return s.types.udts[i], nil
+		}
 	}
-	return res, nil
+	return nil, errors.Newf("type name %s not found by smither", name.Object())
 }
 
 // ResolveTypeByOID implements the tree.TypeReferenceResolver interface.