Merge pull request #9738 from filecoin-project/feat/sched-cycle-metrics
feat: sched: Add metrics around sched cycle
commit d82b2a5804
@@ -14,7 +14,7 @@ import (
 )
 
 // Distribution
-var defaultMillisecondsDistribution = view.Distribution(0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16, 20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500, 650, 800, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100000)
+var defaultMillisecondsDistribution = view.Distribution(0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16, 20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500, 650, 800, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100_000, 250_000, 500_000, 1000_000)
 var workMillisecondsDistribution = view.Distribution(
     250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 2*60_000, 5*60_000, 10*60_000, 15*60_000, 30*60_000, // short sealing tasks
     40*60_000, 45*60_000, 50*60_000, 55*60_000, 60*60_000, 65*60_000, 70*60_000, 75*60_000, 80*60_000, 85*60_000, 100*60_000, 120*60_000, // PC2 / C2 range
@@ -22,6 +22,8 @@ var workMillisecondsDistribution = view.Distribution(
     350*60_000, 400*60_000, 600*60_000, 800*60_000, 1000*60_000, 1300*60_000, 1800*60_000, 4000*60_000, 10000*60_000, // intel PC1 range
 )
 
+var queueSizeDistribution = view.Distribution(0, 1, 2, 3, 5, 7, 10, 15, 25, 35, 50, 70, 90, 130, 200, 300, 500, 1000, 2000, 5000, 10000)
+
 // Global Tags
 var (
     // common
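Note: the hunk above widens defaultMillisecondsDistribution (the top bucket grows from 100,000 ms to 1,000,000 ms, i.e. from roughly 100 s to roughly 1,000 s) and introduces queueSizeDistribution for dimensionless counts. For context, view.Distribution(bounds...) is the stock OpenCensus histogram aggregation: each recorded value is counted into the bucket whose upper bound it falls under. A minimal, self-contained sketch of that behaviour; the demo/ names below are illustrative only, not part of this change:

package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

// Illustrative measure, similar in shape to the ones added in this change.
var demoQueueSize = stats.Int64("demo/queue_size", "demo queue size", stats.UnitDimensionless)

var demoQueueSizeView = &view.View{
	Measure: demoQueueSize,
	// Recorded values are counted into buckets (<=0, <=1, <=2, ..., >10000).
	Aggregation: view.Distribution(0, 1, 2, 3, 5, 7, 10, 15, 25, 35, 50, 70, 90, 130, 200, 300, 500, 1000, 2000, 5000, 10000),
}

func main() {
	if err := view.Register(demoQueueSizeView); err != nil {
		log.Fatal(err)
	}
	// Lands the value 42 in the (35, 50] bucket of the registered view.
	stats.Record(context.Background(), demoQueueSize.M(42))
}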
@@ -136,6 +138,13 @@ var (
     StorageLimitUsedBytes = stats.Int64("storage/path_limit_used_bytes", "used optional storage limit bytes", stats.UnitBytes)
     StorageLimitMaxBytes = stats.Int64("storage/path_limit_max_bytes", "optional storage limit", stats.UnitBytes)
 
+    SchedAssignerCycleDuration = stats.Float64("sched/assigner_cycle_ms", "Duration of scheduler assigner cycle", stats.UnitMilliseconds)
+    SchedAssignerCandidatesDuration = stats.Float64("sched/assigner_cycle_candidates_ms", "Duration of scheduler assigner candidate matching step", stats.UnitMilliseconds)
+    SchedAssignerWindowSelectionDuration = stats.Float64("sched/assigner_cycle_window_select_ms", "Duration of scheduler window selection step", stats.UnitMilliseconds)
+    SchedAssignerSubmitDuration = stats.Float64("sched/assigner_cycle_submit_ms", "Duration of scheduler window submit step", stats.UnitMilliseconds)
+    SchedCycleOpenWindows = stats.Int64("sched/assigner_cycle_open_window", "Number of open windows in scheduling cycles", stats.UnitDimensionless)
+    SchedCycleQueueSize = stats.Int64("sched/assigner_cycle_task_queue_entry", "Number of task queue entries in scheduling cycles", stats.UnitDimensionless)
+
     DagStorePRInitCount = stats.Int64("dagstore/pr_init_count", "PieceReader init count", stats.UnitDimensionless)
     DagStorePRBytesRequested = stats.Int64("dagstore/pr_requested_bytes", "PieceReader requested bytes", stats.UnitBytes)
     DagStorePRBytesDiscarded = stats.Int64("dagstore/pr_discarded_bytes", "PieceReader discarded bytes", stats.UnitBytes)
@@ -428,6 +437,31 @@ var (
         TagKeys: []tag.Key{StorageID},
     }
 
+    SchedAssignerCycleDurationView = &view.View{
+        Measure: SchedAssignerCycleDuration,
+        Aggregation: defaultMillisecondsDistribution,
+    }
+    SchedAssignerCandidatesDurationView = &view.View{
+        Measure: SchedAssignerCandidatesDuration,
+        Aggregation: defaultMillisecondsDistribution,
+    }
+    SchedAssignerWindowSelectionDurationView = &view.View{
+        Measure: SchedAssignerWindowSelectionDuration,
+        Aggregation: defaultMillisecondsDistribution,
+    }
+    SchedAssignerSubmitDurationView = &view.View{
+        Measure: SchedAssignerSubmitDuration,
+        Aggregation: defaultMillisecondsDistribution,
+    }
+    SchedCycleOpenWindowsView = &view.View{
+        Measure: SchedCycleOpenWindows,
+        Aggregation: queueSizeDistribution,
+    }
+    SchedCycleQueueSizeView = &view.View{
+        Measure: SchedCycleQueueSize,
+        Aggregation: queueSizeDistribution,
+    }
+
     DagStorePRInitCountView = &view.View{
         Measure: DagStorePRInitCount,
         Aggregation: view.Count(),
@@ -697,6 +731,7 @@ var MinerNodeViews = append([]*view.View{
     WorkerCallsReturnedCountView,
     WorkerUntrackedCallsReturnedView,
     WorkerCallsReturnedDurationView,
+
     SectorStatesView,
     StorageFSAvailableView,
     StorageAvailableView,
@@ -708,6 +743,14 @@ var MinerNodeViews = append([]*view.View{
     StorageReservedBytesView,
     StorageLimitUsedBytesView,
     StorageLimitMaxBytesView,
+
+    SchedAssignerCycleDurationView,
+    SchedAssignerCandidatesDurationView,
+    SchedAssignerWindowSelectionDurationView,
+    SchedAssignerSubmitDurationView,
+    SchedCycleOpenWindowsView,
+    SchedCycleQueueSizeView,
+
     DagStorePRInitCountView,
     DagStorePRBytesRequestedView,
     DagStorePRBytesDiscardedView,
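Note: the six new views only surface once this view slice is registered with OpenCensus and scraped by an exporter. Below is a minimal sketch of that wiring using the stock contrib.go.opencensus.io/exporter/prometheus package; the namespace, port, and endpoint are illustrative and not necessarily how lotus wires its own exporter. With such an exporter the slashes in measure names are typically rewritten to underscores, so sched/assigner_cycle_ms would surface roughly as <namespace>_sched_assigner_cycle_ms.

package main

import (
	"log"
	"net/http"

	"contrib.go.opencensus.io/exporter/prometheus"
	"go.opencensus.io/stats/view"
)

func main() {
	// Register whatever view set the process cares about
	// (for a miner that would be something like metrics.MinerNodeViews...).
	if err := view.Register( /* views... */ ); err != nil {
		log.Fatal(err)
	}

	// The exporter implements http.Handler and serves the current state of
	// all registered views in Prometheus text format.
	exporter, err := prometheus.NewExporter(prometheus.Options{Namespace: "demo"})
	if err != nil {
		log.Fatal(err)
	}

	http.Handle("/debug/metrics", exporter)
	log.Fatal(http.ListenAndServe(":6060", nil))
}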
@@ -105,7 +105,7 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
         return nil, xerrors.Errorf("creating prover instance: %w", err)
     }
 
-    sh, err := newScheduler(sc.Assigner)
+    sh, err := newScheduler(ctx, sc.Assigner)
     if err != nil {
         return nil, err
     }
@@ -110,7 +110,7 @@ func newTestMgr(ctx context.Context, t *testing.T, ds datastore.Datastore) (*Man
 
     stor := paths.NewRemote(lstor, si, nil, 6000, &paths.DefaultPartialFileHandler{})
 
-    sh, err := newScheduler("")
+    sh, err := newScheduler(ctx, "")
     require.NoError(t, err)
 
     m := &Manager{
@@ -10,6 +10,7 @@ import (
 
     "github.com/filecoin-project/go-state-types/abi"
 
+    "github.com/filecoin-project/lotus/metrics"
     "github.com/filecoin-project/lotus/storage/sealer/sealtasks"
     "github.com/filecoin-project/lotus/storage/sealer/storiface"
 )
@@ -51,6 +52,8 @@ type WorkerSelector interface {
 }
 
 type Scheduler struct {
+    mctx context.Context // metrics context
+
     assigner Assigner
 
     workersLk sync.RWMutex
@@ -146,7 +149,7 @@ type rmRequest struct {
     res chan error
 }
 
-func newScheduler(assigner string) (*Scheduler, error) {
+func newScheduler(ctx context.Context, assigner string) (*Scheduler, error) {
     var a Assigner
     switch assigner {
     case "", "utilization":
@@ -158,6 +161,7 @@ func newScheduler(assigner string) (*Scheduler, error) {
     }
 
     return &Scheduler{
+        mctx: ctx,
         assigner: a,
 
         Workers: map[storiface.WorkerID]*WorkerHandle{},
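Note: storing the constructor's context as mctx means every stats.Record and metrics.Timer call made during scheduling inherits whatever OpenCensus tags the caller attached to that context. A small sketch of how tags travel on a context; the hostname key is illustrative, not necessarily one lotus defines:

package main

import (
	"context"
	"fmt"
	"log"

	"go.opencensus.io/tag"
)

// Illustrative tag key; lotus defines its own keys in the metrics package.
var keyHostname = tag.MustNewKey("hostname")

func main() {
	// Tags attached here are carried by the context and applied to every
	// stats.Record made with it, e.g. inside the scheduler cycle.
	mctx, err := tag.New(context.Background(), tag.Upsert(keyHostname, "miner-1"))
	if err != nil {
		log.Fatal(err)
	}

	// Reading the tag back out of the context.
	v, ok := tag.FromContext(mctx).Value(keyHostname)
	fmt.Println(v, ok) // "miner-1 true"
}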
@@ -366,6 +370,9 @@ func (sh *Scheduler) trySched() {
     sh.workersLk.RLock()
     defer sh.workersLk.RUnlock()
 
+    done := metrics.Timer(sh.mctx, metrics.SchedAssignerCycleDuration)
+    defer done()
+
     sh.assigner.TrySched(sh)
 }
 
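Note: the hunk above times the whole trySched call with metrics.Timer. The helper itself is not part of this diff; assuming it behaves like a typical OpenCensus timing helper, it could look roughly like the sketch below — capture a start time and return a function that records the elapsed milliseconds into the given Float64 measure when invoked (e.g. via defer done()). The real lotus implementation may differ in detail.

package main

import (
	"context"
	"fmt"
	"time"

	"go.opencensus.io/stats"
)

// timer is a sketch of a helper with the same call shape as metrics.Timer.
func timer(ctx context.Context, m *stats.Float64Measure) func() time.Duration {
	start := time.Now()
	return func() time.Duration {
		elapsed := time.Since(start)
		// Record the elapsed wall-clock time, in milliseconds, against m.
		stats.Record(ctx, m.M(float64(elapsed.Milliseconds())))
		return elapsed
	}
}

// Illustrative measure, standing in for metrics.SchedAssignerCycleDuration.
var demoCycleDuration = stats.Float64("demo/cycle_ms", "demo cycle duration", stats.UnitMilliseconds)

func main() {
	done := timer(context.Background(), demoCycleDuration)
	defer done()

	time.Sleep(10 * time.Millisecond) // stand-in for the scheduling work
	fmt.Println("cycle finished")
}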
@@ -5,6 +5,10 @@ import (
     "math/rand"
     "sort"
     "sync"
+
+    "go.opencensus.io/stats"
+
+    "github.com/filecoin-project/lotus/metrics"
 )
 
 type WindowSelector func(sh *Scheduler, queueLen int, acceptableWindows [][]int, windows []SchedWindow) int
@@ -36,6 +40,9 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
     windowsLen := len(sh.OpenWindows)
     queueLen := sh.SchedQueue.Len()
 
+    stats.Record(sh.mctx, metrics.SchedCycleOpenWindows.M(int64(windowsLen)))
+    stats.Record(sh.mctx, metrics.SchedCycleQueueSize.M(int64(queueLen)))
+
     log.Debugf("SCHED %d queued; %d open windows", queueLen, windowsLen)
 
     if windowsLen == 0 || queueLen == 0 {
@@ -52,6 +59,11 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
     // Step 1
     throttle := make(chan struct{}, windowsLen)
 
+    partDone := metrics.Timer(sh.mctx, metrics.SchedAssignerCandidatesDuration)
+    defer func() {
+        partDone()
+    }()
+
     var wg sync.WaitGroup
     wg.Add(queueLen)
     for i := 0; i < queueLen; i++ {
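Worth noting why the defer wraps partDone in a closure rather than using defer partDone() directly: defer evaluates its function value immediately, so a direct defer would always stop the Step 1 timer, whereas the closure re-reads partDone at return time and therefore stops whichever phase timer is active when TrySched exits early (partDone is re-armed for Step 2 and Step 3 in the next hunk). A tiny standalone illustration of that Go behaviour:

package main

import "fmt"

func main() {
	f := func() { fmt.Println("first timer stopped") }

	// defer f() would evaluate f now and always print "first timer stopped".
	// Wrapping it in a closure re-reads f when main returns:
	defer func() { f() }()

	f = func() { fmt.Println("second timer stopped") } // like partDone being re-armed per step

	// Prints "second timer stopped" on exit.
}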
@@ -151,9 +163,14 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
     log.Debugf("SCHED Acceptable win: %+v", acceptableWindows)
 
     // Step 2
+    partDone()
+    partDone = metrics.Timer(sh.mctx, metrics.SchedAssignerWindowSelectionDuration)
 
     scheduled := a.WindowSel(sh, queueLen, acceptableWindows, windows)
 
     // Step 3
+    partDone()
+    partDone = metrics.Timer(sh.mctx, metrics.SchedAssignerSubmitDuration)
+
     if scheduled == 0 {
         return
@@ -226,7 +226,7 @@ func addTestWorker(t *testing.T, sched *Scheduler, index *paths.Index, name stri
 }
 
 func TestSchedStartStop(t *testing.T) {
-    sched, err := newScheduler("")
+    sched, err := newScheduler(context.Background(), "")
     require.NoError(t, err)
     go sched.runSched()
 
@@ -356,7 +356,7 @@ func TestSched(t *testing.T) {
     return func(t *testing.T) {
         index := paths.NewIndex(nil)
 
-        sched, err := newScheduler("")
+        sched, err := newScheduler(ctx, "")
         require.NoError(t, err)
         sched.testSync = make(chan struct{})
 
@@ -609,7 +609,7 @@ func BenchmarkTrySched(b *testing.B) {
     for i := 0; i < b.N; i++ {
         b.StopTimer()
 
-        sched, err := newScheduler("")
+        sched, err := newScheduler(ctx, "")
         require.NoError(b, err)
         sched.Workers[storiface.WorkerID{}] = &WorkerHandle{
             workerRpc: nil,