Merge pull request #9738 from filecoin-project/feat/sched-cycle-metrics
feat: sched: Add metrics around sched cycle
commit d82b2a5804
@@ -14,7 +14,7 @@ import (
 )
 
 // Distribution
-var defaultMillisecondsDistribution = view.Distribution(0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16, 20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500, 650, 800, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100000)
+var defaultMillisecondsDistribution = view.Distribution(0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16, 20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500, 650, 800, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100_000, 250_000, 500_000, 1000_000)
 var workMillisecondsDistribution = view.Distribution(
 	250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 2*60_000, 5*60_000, 10*60_000, 15*60_000, 30*60_000, // short sealing tasks
 	40*60_000, 45*60_000, 50*60_000, 55*60_000, 60*60_000, 65*60_000, 70*60_000, 75*60_000, 80*60_000, 85*60_000, 100*60_000, 120*60_000, // PC2 / C2 range
@@ -22,6 +22,8 @@ var workMillisecondsDistribution = view.Distribution(
 	350*60_000, 400*60_000, 600*60_000, 800*60_000, 1000*60_000, 1300*60_000, 1800*60_000, 4000*60_000, 10000*60_000, // intel PC1 range
 )
 
+var queueSizeDistribution = view.Distribution(0, 1, 2, 3, 5, 7, 10, 15, 25, 35, 50, 70, 90, 130, 200, 300, 500, 1000, 2000, 5000, 10000)
+
 // Global Tags
 var (
 	// common
@@ -136,6 +138,13 @@ var (
 	StorageLimitUsedBytes = stats.Int64("storage/path_limit_used_bytes", "used optional storage limit bytes", stats.UnitBytes)
 	StorageLimitMaxBytes = stats.Int64("storage/path_limit_max_bytes", "optional storage limit", stats.UnitBytes)
 
+	SchedAssignerCycleDuration = stats.Float64("sched/assigner_cycle_ms", "Duration of scheduler assigner cycle", stats.UnitMilliseconds)
+	SchedAssignerCandidatesDuration = stats.Float64("sched/assigner_cycle_candidates_ms", "Duration of scheduler assigner candidate matching step", stats.UnitMilliseconds)
+	SchedAssignerWindowSelectionDuration = stats.Float64("sched/assigner_cycle_window_select_ms", "Duration of scheduler window selection step", stats.UnitMilliseconds)
+	SchedAssignerSubmitDuration = stats.Float64("sched/assigner_cycle_submit_ms", "Duration of scheduler window submit step", stats.UnitMilliseconds)
+	SchedCycleOpenWindows = stats.Int64("sched/assigner_cycle_open_window", "Number of open windows in scheduling cycles", stats.UnitDimensionless)
+	SchedCycleQueueSize = stats.Int64("sched/assigner_cycle_task_queue_entry", "Number of task queue entries in scheduling cycles", stats.UnitDimensionless)
+
 	DagStorePRInitCount = stats.Int64("dagstore/pr_init_count", "PieceReader init count", stats.UnitDimensionless)
 	DagStorePRBytesRequested = stats.Int64("dagstore/pr_requested_bytes", "PieceReader requested bytes", stats.UnitBytes)
 	DagStorePRBytesDiscarded = stats.Int64("dagstore/pr_discarded_bytes", "PieceReader discarded bytes", stats.UnitBytes)
@@ -428,6 +437,31 @@ var (
 		TagKeys: []tag.Key{StorageID},
 	}
 
+	SchedAssignerCycleDurationView = &view.View{
+		Measure: SchedAssignerCycleDuration,
+		Aggregation: defaultMillisecondsDistribution,
+	}
+	SchedAssignerCandidatesDurationView = &view.View{
+		Measure: SchedAssignerCandidatesDuration,
+		Aggregation: defaultMillisecondsDistribution,
+	}
+	SchedAssignerWindowSelectionDurationView = &view.View{
+		Measure: SchedAssignerWindowSelectionDuration,
+		Aggregation: defaultMillisecondsDistribution,
+	}
+	SchedAssignerSubmitDurationView = &view.View{
+		Measure: SchedAssignerSubmitDuration,
+		Aggregation: defaultMillisecondsDistribution,
+	}
+	SchedCycleOpenWindowsView = &view.View{
+		Measure: SchedCycleOpenWindows,
+		Aggregation: queueSizeDistribution,
+	}
+	SchedCycleQueueSizeView = &view.View{
+		Measure: SchedCycleQueueSize,
+		Aggregation: queueSizeDistribution,
+	}
+
 	DagStorePRInitCountView = &view.View{
 		Measure: DagStorePRInitCount,
 		Aggregation: view.Count(),
@@ -697,6 +731,7 @@ var MinerNodeViews = append([]*view.View{
 	WorkerCallsReturnedCountView,
 	WorkerUntrackedCallsReturnedView,
 	WorkerCallsReturnedDurationView,
+
 	SectorStatesView,
 	StorageFSAvailableView,
 	StorageAvailableView,
@@ -708,6 +743,14 @@ var MinerNodeViews = append([]*view.View{
 	StorageReservedBytesView,
 	StorageLimitUsedBytesView,
 	StorageLimitMaxBytesView,
+
+	SchedAssignerCycleDurationView,
+	SchedAssignerCandidatesDurationView,
+	SchedAssignerWindowSelectionDurationView,
+	SchedAssignerSubmitDurationView,
+	SchedCycleOpenWindowsView,
+	SchedCycleQueueSizeView,
+
 	DagStorePRInitCountView,
 	DagStorePRBytesRequestedView,
 	DagStorePRBytesDiscardedView,
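The hunks above extend the miner metrics definitions with six new scheduler measures, matching distribution views, and their registration in MinerNodeViews. For readers new to OpenCensus, the pattern is: declare a measure, wrap it in a view with an aggregation, register the view, then record observations. A minimal standalone sketch of that pattern (the `example/cycle_ms` name and bucket bounds are made up for illustration, not part of this change):

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

// exampleDuration is a made-up measure mirroring the sched/assigner_cycle_ms pattern above.
var exampleDuration = stats.Float64("example/cycle_ms", "duration of an example cycle", stats.UnitMilliseconds)

// A histogram view over that measure, analogous to defaultMillisecondsDistribution.
var exampleDurationView = &view.View{
	Measure:     exampleDuration,
	Aggregation: view.Distribution(1, 5, 10, 50, 100, 500, 1000),
}

func main() {
	// Views must be registered before recorded values are aggregated.
	if err := view.Register(exampleDurationView); err != nil {
		log.Fatal(err)
	}
	// Record one observation; it lands in the 10-50ms bucket of the view.
	stats.Record(context.Background(), exampleDuration.M(12.5))
}
```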
@@ -105,7 +105,7 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
 		return nil, xerrors.Errorf("creating prover instance: %w", err)
 	}
 
-	sh, err := newScheduler(sc.Assigner)
+	sh, err := newScheduler(ctx, sc.Assigner)
 	if err != nil {
 		return nil, err
 	}
@@ -110,7 +110,7 @@ func newTestMgr(ctx context.Context, t *testing.T, ds datastore.Datastore) (*Man
 	stor := paths.NewRemote(lstor, si, nil, 6000, &paths.DefaultPartialFileHandler{})
 
-	sh, err := newScheduler("")
+	sh, err := newScheduler(ctx, "")
 	require.NoError(t, err)
 
 	m := &Manager{
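The Manager constructor and the test helper now hand their context to newScheduler; that context is what the scheduler later records measurements against. The reason a context is worth threading through is that OpenCensus tags attached to it propagate to every measurement recorded with it. A hedged illustration (the `role` tag key is invented here, not part of this change):

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/tag"
)

// keyRole is a hypothetical tag key, purely for illustration.
var keyRole = tag.MustNewKey("role")

var cycles = stats.Int64("example/cycles", "example counter", stats.UnitDimensionless)

func main() {
	// Attach a tag value to the context once...
	ctx, err := tag.New(context.Background(), tag.Insert(keyRole, "miner"))
	if err != nil {
		log.Fatal(err)
	}
	// ...and every measurement recorded against that context carries it
	// (views listing the key in TagKeys can then break results down by it).
	stats.Record(ctx, cycles.M(1))
}
```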
@@ -10,6 +10,7 @@ import (
 
 	"github.com/filecoin-project/go-state-types/abi"
 
+	"github.com/filecoin-project/lotus/metrics"
 	"github.com/filecoin-project/lotus/storage/sealer/sealtasks"
 	"github.com/filecoin-project/lotus/storage/sealer/storiface"
 )
@@ -51,6 +52,8 @@ type WorkerSelector interface {
 }
 
 type Scheduler struct {
+	mctx context.Context // metrics context
+
 	assigner Assigner
 
 	workersLk sync.RWMutex
@@ -146,7 +149,7 @@ type rmRequest struct {
 	res chan error
 }
 
-func newScheduler(assigner string) (*Scheduler, error) {
+func newScheduler(ctx context.Context, assigner string) (*Scheduler, error) {
 	var a Assigner
 	switch assigner {
 	case "", "utilization":
@@ -158,6 +161,7 @@ func newScheduler(assigner string) (*Scheduler, error) {
 	}
 
 	return &Scheduler{
+		mctx: ctx,
 		assigner: a,
 
 		Workers: map[storiface.WorkerID]*WorkerHandle{},
@@ -366,6 +370,9 @@ func (sh *Scheduler) trySched() {
 	sh.workersLk.RLock()
 	defer sh.workersLk.RUnlock()
 
+	done := metrics.Timer(sh.mctx, metrics.SchedAssignerCycleDuration)
+	defer done()
+
 	sh.assigner.TrySched(sh)
 }
 
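trySched now wraps the whole scheduling cycle in metrics.Timer, the stopwatch helper from lotus's metrics package. Its implementation isn't shown in this diff; a minimal sketch of such a helper, assuming the standard OpenCensus API, could look like this:

```go
package main

import (
	"context"
	"time"

	"go.opencensus.io/stats"
)

var cycleMs = stats.Float64("example/sched_cycle_ms", "example cycle duration", stats.UnitMilliseconds)

// timer is a sketch of a Timer-style stopwatch: calling it starts the clock,
// and calling the returned function records the elapsed milliseconds on the measure.
func timer(ctx context.Context, m *stats.Float64Measure) func() {
	start := time.Now()
	return func() {
		stats.Record(ctx, m.M(float64(time.Since(start).Milliseconds())))
	}
}

func doCycle(ctx context.Context) {
	done := timer(ctx, cycleMs)       // mirrors trySched: start the cycle stopwatch
	defer done()                      // and record the duration on the way out
	time.Sleep(10 * time.Millisecond) // stand-in for the actual scheduling work
}

func main() { doCycle(context.Background()) }
```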
@@ -5,6 +5,10 @@ import (
 	"math/rand"
 	"sort"
 	"sync"
+
+	"go.opencensus.io/stats"
+
+	"github.com/filecoin-project/lotus/metrics"
 )
 
 type WindowSelector func(sh *Scheduler, queueLen int, acceptableWindows [][]int, windows []SchedWindow) int
@@ -36,6 +40,9 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
 	windowsLen := len(sh.OpenWindows)
 	queueLen := sh.SchedQueue.Len()
 
+	stats.Record(sh.mctx, metrics.SchedCycleOpenWindows.M(int64(windowsLen)))
+	stats.Record(sh.mctx, metrics.SchedCycleQueueSize.M(int64(queueLen)))
+
 	log.Debugf("SCHED %d queued; %d open windows", queueLen, windowsLen)
 
 	if windowsLen == 0 || queueLen == 0 {
@@ -52,6 +59,11 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
 	// Step 1
 	throttle := make(chan struct{}, windowsLen)
 
+	partDone := metrics.Timer(sh.mctx, metrics.SchedAssignerCandidatesDuration)
+	defer func() {
+		partDone()
+	}()
+
 	var wg sync.WaitGroup
 	wg.Add(queueLen)
 	for i := 0; i < queueLen; i++ {
@@ -151,9 +163,14 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
 	log.Debugf("SCHED Acceptable win: %+v", acceptableWindows)
 
 	// Step 2
+	partDone()
+	partDone = metrics.Timer(sh.mctx, metrics.SchedAssignerWindowSelectionDuration)
+
 	scheduled := a.WindowSel(sh, queueLen, acceptableWindows, windows)
 
 	// Step 3
+	partDone()
+	partDone = metrics.Timer(sh.mctx, metrics.SchedAssignerSubmitDuration)
+
 	if scheduled == 0 {
 		return
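The assigner hunks above time each step by closing the current stopwatch and immediately re-arming partDone for the next one, while the deferred closure guarantees whichever step is still open at return gets recorded. Isolated from the scheduler and reusing the timer sketch from the previous example (the measure names and phase functions here are placeholders, not part of the actual change):

```go
// Placeholder measures standing in for SchedAssignerCandidatesDuration etc.
var (
	candidatesMs   = stats.Float64("example/candidates_ms", "candidate matching step", stats.UnitMilliseconds)
	windowSelectMs = stats.Float64("example/window_select_ms", "window selection step", stats.UnitMilliseconds)
	submitMs       = stats.Float64("example/submit_ms", "window submit step", stats.UnitMilliseconds)
)

// runPhases shows the re-armed stopwatch pattern used by TrySched above.
func runPhases(ctx context.Context, findCandidates, selectWindows, submit func()) {
	partDone := timer(ctx, candidatesMs) // Step 1: candidate matching
	defer func() {
		partDone() // whichever stopwatch is current when we return gets closed
	}()

	findCandidates()

	partDone()                            // close the "candidates" stopwatch...
	partDone = timer(ctx, windowSelectMs) // ...and immediately start "window selection"

	selectWindows()

	partDone()                      // close "window selection"...
	partDone = timer(ctx, submitMs) // ...the deferred call above records "submit"

	submit()
}
```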
@@ -226,7 +226,7 @@ func addTestWorker(t *testing.T, sched *Scheduler, index *paths.Index, name stri
 }
 
 func TestSchedStartStop(t *testing.T) {
-	sched, err := newScheduler("")
+	sched, err := newScheduler(context.Background(), "")
 	require.NoError(t, err)
 	go sched.runSched()
 
@@ -356,7 +356,7 @@ func TestSched(t *testing.T) {
 	return func(t *testing.T) {
 		index := paths.NewIndex(nil)
 
-		sched, err := newScheduler("")
+		sched, err := newScheduler(ctx, "")
 		require.NoError(t, err)
 		sched.testSync = make(chan struct{})
 
@@ -609,7 +609,7 @@ func BenchmarkTrySched(b *testing.B) {
 		for i := 0; i < b.N; i++ {
 			b.StopTimer()
 
-			sched, err := newScheduler("")
+			sched, err := newScheduler(ctx, "")
 			require.NoError(b, err)
 			sched.Workers[storiface.WorkerID{}] = &WorkerHandle{
 				workerRpc: nil,
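Note that the views appended to MinerNodeViews only yield visible histograms once they are registered and exported, which lotus does in its own metrics wiring. A generic sketch using the stock OpenCensus Prometheus exporter (endpoint, port, and namespace here are illustrative, not lotus's actual setup):

```go
package main

import (
	"log"
	"net/http"

	"contrib.go.opencensus.io/exporter/prometheus"
	"go.opencensus.io/stats/view"
)

func main() {
	// Register whichever views should be aggregated; for a miner this would
	// be the MinerNodeViews slice that the new views were appended to.
	if err := view.Register( /* metrics.MinerNodeViews... */ ); err != nil {
		log.Fatal(err)
	}

	// Serve the aggregated data on a Prometheus scrape endpoint.
	pe, err := prometheus.NewExporter(prometheus.Options{Namespace: "example"})
	if err != nil {
		log.Fatal(err)
	}
	http.Handle("/debug/metrics", pe)
	log.Fatal(http.ListenAndServe(":6060", nil))
}
```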