[AMDGPU] Add MaxMemoryClauseSchedStrategy (llvm#114957)
Also expose an option to choose a custom scheduler strategy:
amdgpu-sched-strategy={max-ilp|max-memory-clause}
This can be set through either a function attribute or a command-line option.
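As a minimal illustration, the per-function route could look like the sketch
below (the helper is hypothetical; only the attribute name and value come
from this patch):

  #include "llvm/IR/Function.h"

  // Hypothetical helper: opt a single function into the new strategy. The
  // attribute overrides the global -amdgpu-sched-strategy flag for that
  // function.
  static void useMaxMemoryClauseScheduling(llvm::Function &F) {
    F.addFnAttr("amdgpu-sched-strategy", "max-memory-clause");
  }

From the command line, the equivalent is the hidden flag
-amdgpu-sched-strategy=max-memory-clause added in AMDGPUTargetMachine.cpp.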

The major behaviors of the max-memory-clause scheduling strategy include:
1. Try to cluster memory instructions more aggressively.
2. Try to schedule long-latency loads earlier than short-latency
   instructions.

I tested locally against about 470 real shaders and got the following perf
changes (only counting changes over +/-10%):
About 15 shaders improved by 10%~40%.
Only 3 shaders dropped by ~10%.

(This was tested together with another change which increases the
maximum number of clustered dwords from 8 to 32.)
I will make another change to make that threshold configurable.
ruiling authored Dec 9, 2024
1 parent a4506bb commit b33c807
Showing 6 changed files with 654 additions and 9 deletions.
35 changes: 30 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -428,10 +428,10 @@ static cl::opt<bool>
cl::desc("Enable loop data prefetch on AMDGPU"),
cl::Hidden, cl::init(false));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
"amdgpu-enable-max-ilp-scheduling-strategy",
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
cl::Hidden, cl::init(false));
static cl::opt<std::string>
AMDGPUSchedStrategy("amdgpu-sched-strategy",
cl::desc("Select custom AMDGPU scheduling strategy."),
cl::Hidden, cl::init(""));

static cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
@@ -567,6 +567,18 @@ createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
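// Clustering memory operations is the point of this strategy: loads are
// always clustered, and stores are clustered when the subtarget benefits.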
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
@@ -607,6 +619,10 @@ static MachineSchedRegistry
GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
createGCNMaxILPMachineScheduler);

static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
"gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
createGCNMaxMemoryClauseMachineScheduler);
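// (Registering here should also make the strategy selectable via the generic
// -misched=gcn-max-memory-clause option.)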

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
"gcn-iterative-max-occupancy-experimental",
"Run GCN scheduler to maximize occupancy (experimental)",
@@ -1294,9 +1310,18 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);

if (EnableMaxIlpSchedStrategy)
Attribute SchedStrategyAttr =
C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
StringRef SchedStrategy = SchedStrategyAttr.isValid()
? SchedStrategyAttr.getValueAsString()
: AMDGPUSchedStrategy;

if (SchedStrategy == "max-ilp")
return createGCNMaxILPMachineScheduler(C);

if (SchedStrategy == "max-memory-clause")
return createGCNMaxMemoryClauseMachineScheduler(C);

return createGCNMaxOccupancyMachineScheduler(C);
}
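Note that the per-function amdgpu-sched-strategy attribute, when present,
takes precedence over the command-line option, and any unrecognized strategy
string simply falls through to the default max-occupancy scheduler.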

146 changes: 145 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -615,6 +615,138 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}

GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
}

/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
/// 1. Prioritizing clustered operations before the stall latency heuristic.
/// 2. Prioritizing long-latency loads before the stall latency heuristic.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
/// if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}

// Bias PhysReg defs and copies toward their uses and definitions,
// respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;

if (DAG->isTrackingPressure()) {
// Avoid exceeding the target's limit.
if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;

// Avoid increasing the max critical pressure in the scheduled region.
if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
}

// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
return TryCand.Reason != NoCand;

// We only compare a subset of features when comparing nodes between
// Top and Bottom boundary. Some properties are simply incomparable; in many
// other instances we should only override the other boundary if something
// is a clearly good pick on one boundary. Skip heuristics that are more
// "tie-breaking" in nature.
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within a single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;

// MaxMemoryClause-specific: Prioritize long-latency memory load
// instructions in top-bottom order to hide more latency. The mayLoad check
// is used to exclude store-like instructions, which we do not want to
// schedule too early.
bool TryMayLoad =
TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();

if (TryMayLoad || CandMayLoad) {
bool TryLongLatency =
TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
bool CandLongLatency =
10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
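// At the top boundary prefer the long-latency candidate directly; at the
// bottom boundary the operands are mirrored so that long-latency loads
// still end up earlier in the final program order.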

if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
Cand, Stall))
return TryCand.Reason != NoCand;
}
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
}

if (SameBoundary) {
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}

// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;

if (SameBoundary) {
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;

// Avoid serializing long latency dependence chains.
// For acyclic path limited loops, latency was already checked above.
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;

// Fall through to original instruction order.
if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
TryCand.Reason = NodeOrder;
return true;
}
}

return false;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
@@ -644,6 +776,9 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
return std::make_unique<PreRARematStage>(SchedStageID, *this);
case GCNSchedStageID::ILPInitialSchedule:
return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
case GCNSchedStageID::MemoryClauseInitialSchedule:
return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
*this);
}

llvm_unreachable("Unknown SchedStageID.");
@@ -869,6 +1004,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
case GCNSchedStageID::ILPInitialSchedule:
OS << "Max ILP Initial Schedule";
break;
case GCNSchedStageID::MemoryClauseInitialSchedule:
OS << "Max memory clause Initial Schedule";
break;
}

return OS;
@@ -1088,7 +1226,8 @@ void GCNSchedStage::setupNewBlock() {
// Get real RP for the region if it hasn't been calculated before. After the
// initial schedule stage, real RP will be collected after scheduling.
if (StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule)
StageID == GCNSchedStageID::ILPInitialSchedule ||
StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

@@ -1389,6 +1528,11 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
return false;
}

bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
unsigned WavesAfter) {
return mayCauseSpilling(WavesAfter);
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
!PressureAfter.less(MF, PressureBefore)) {
23 changes: 22 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -29,7 +29,8 @@ enum class GCNSchedStageID : unsigned {
UnclusteredHighRPReschedule = 1,
ClusteredLowOccupancyReschedule = 2,
PreRARematerialize = 3,
ILPInitialSchedule = 4
ILPInitialSchedule = 4,
MemoryClauseInitialSchedule = 5
};

#ifndef NDEBUG
@@ -149,6 +150,17 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
GCNMaxILPSchedStrategy(const MachineSchedContext *C);
};

/// The goal of this scheduling strategy is to maximize memory clauses for a
/// single wave.
class GCNMaxMemoryClauseSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;

public:
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C);
};

class ScheduleMetrics {
unsigned ScheduleLength;
unsigned BubbleCycles;
@@ -463,6 +475,15 @@ class ILPInitialScheduleStage : public GCNSchedStage {
: GCNSchedStage(StageID, DAG) {}
};

class MemoryClauseInitialScheduleStage : public GCNSchedStage {
public:
bool shouldRevertScheduling(unsigned WavesAfter) override;

MemoryClauseInitialScheduleStage(GCNSchedStageID StageID,
GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};

class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
private:
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;