Merge pull request #9738 from filecoin-project/feat/sched-cycle-metrics

feat: sched: Add metrics around sched cycle
This commit is contained in:
Łukasz Magiera 2022-11-28 23:57:58 +01:00 committed by GitHub
commit d82b2a5804
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 74 additions and 7 deletions

View File

@ -14,7 +14,7 @@ import (
)
// Distribution
var defaultMillisecondsDistribution = view.Distribution(0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16, 20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500, 650, 800, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100000)
var defaultMillisecondsDistribution = view.Distribution(0.01, 0.05, 0.1, 0.3, 0.6, 0.8, 1, 2, 3, 4, 5, 6, 8, 10, 13, 16, 20, 25, 30, 40, 50, 65, 80, 100, 130, 160, 200, 250, 300, 400, 500, 650, 800, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000, 100_000, 250_000, 500_000, 1000_000)
var workMillisecondsDistribution = view.Distribution(
250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 2*60_000, 5*60_000, 10*60_000, 15*60_000, 30*60_000, // short sealing tasks
40*60_000, 45*60_000, 50*60_000, 55*60_000, 60*60_000, 65*60_000, 70*60_000, 75*60_000, 80*60_000, 85*60_000, 100*60_000, 120*60_000, // PC2 / C2 range
@ -22,6 +22,8 @@ var workMillisecondsDistribution = view.Distribution(
350*60_000, 400*60_000, 600*60_000, 800*60_000, 1000*60_000, 1300*60_000, 1800*60_000, 4000*60_000, 10000*60_000, // intel PC1 range
)
var queueSizeDistribution = view.Distribution(0, 1, 2, 3, 5, 7, 10, 15, 25, 35, 50, 70, 90, 130, 200, 300, 500, 1000, 2000, 5000, 10000)
// Global Tags
var (
// common
@ -136,6 +138,13 @@ var (
StorageLimitUsedBytes = stats.Int64("storage/path_limit_used_bytes", "used optional storage limit bytes", stats.UnitBytes)
StorageLimitMaxBytes = stats.Int64("storage/path_limit_max_bytes", "optional storage limit", stats.UnitBytes)
SchedAssignerCycleDuration = stats.Float64("sched/assigner_cycle_ms", "Duration of scheduler assigner cycle", stats.UnitMilliseconds)
SchedAssignerCandidatesDuration = stats.Float64("sched/assigner_cycle_candidates_ms", "Duration of scheduler assigner candidate matching step", stats.UnitMilliseconds)
SchedAssignerWindowSelectionDuration = stats.Float64("sched/assigner_cycle_window_select_ms", "Duration of scheduler window selection step", stats.UnitMilliseconds)
SchedAssignerSubmitDuration = stats.Float64("sched/assigner_cycle_submit_ms", "Duration of scheduler window submit step", stats.UnitMilliseconds)
SchedCycleOpenWindows = stats.Int64("sched/assigner_cycle_open_window", "Number of open windows in scheduling cycles", stats.UnitDimensionless)
SchedCycleQueueSize = stats.Int64("sched/assigner_cycle_task_queue_entry", "Number of task queue entries in scheduling cycles", stats.UnitDimensionless)
DagStorePRInitCount = stats.Int64("dagstore/pr_init_count", "PieceReader init count", stats.UnitDimensionless)
DagStorePRBytesRequested = stats.Int64("dagstore/pr_requested_bytes", "PieceReader requested bytes", stats.UnitBytes)
DagStorePRBytesDiscarded = stats.Int64("dagstore/pr_discarded_bytes", "PieceReader discarded bytes", stats.UnitBytes)
@ -428,6 +437,31 @@ var (
TagKeys: []tag.Key{StorageID},
}
SchedAssignerCycleDurationView = &view.View{
Measure: SchedAssignerCycleDuration,
Aggregation: defaultMillisecondsDistribution,
}
SchedAssignerCandidatesDurationView = &view.View{
Measure: SchedAssignerCandidatesDuration,
Aggregation: defaultMillisecondsDistribution,
}
SchedAssignerWindowSelectionDurationView = &view.View{
Measure: SchedAssignerWindowSelectionDuration,
Aggregation: defaultMillisecondsDistribution,
}
SchedAssignerSubmitDurationView = &view.View{
Measure: SchedAssignerSubmitDuration,
Aggregation: defaultMillisecondsDistribution,
}
SchedCycleOpenWindowsView = &view.View{
Measure: SchedCycleOpenWindows,
Aggregation: queueSizeDistribution,
}
SchedCycleQueueSizeView = &view.View{
Measure: SchedCycleQueueSize,
Aggregation: queueSizeDistribution,
}
DagStorePRInitCountView = &view.View{
Measure: DagStorePRInitCount,
Aggregation: view.Count(),
@ -697,6 +731,7 @@ var MinerNodeViews = append([]*view.View{
WorkerCallsReturnedCountView,
WorkerUntrackedCallsReturnedView,
WorkerCallsReturnedDurationView,
SectorStatesView,
StorageFSAvailableView,
StorageAvailableView,
@ -708,6 +743,14 @@ var MinerNodeViews = append([]*view.View{
StorageReservedBytesView,
StorageLimitUsedBytesView,
StorageLimitMaxBytesView,
SchedAssignerCycleDurationView,
SchedAssignerCandidatesDurationView,
SchedAssignerWindowSelectionDurationView,
SchedAssignerSubmitDurationView,
SchedCycleOpenWindowsView,
SchedCycleQueueSizeView,
DagStorePRInitCountView,
DagStorePRBytesRequestedView,
DagStorePRBytesDiscardedView,

View File

@ -105,7 +105,7 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
return nil, xerrors.Errorf("creating prover instance: %w", err)
}
sh, err := newScheduler(sc.Assigner)
sh, err := newScheduler(ctx, sc.Assigner)
if err != nil {
return nil, err
}

View File

@ -110,7 +110,7 @@ func newTestMgr(ctx context.Context, t *testing.T, ds datastore.Datastore) (*Man
stor := paths.NewRemote(lstor, si, nil, 6000, &paths.DefaultPartialFileHandler{})
sh, err := newScheduler("")
sh, err := newScheduler(ctx, "")
require.NoError(t, err)
m := &Manager{

View File

@ -10,6 +10,7 @@ import (
"github.com/filecoin-project/go-state-types/abi"
"github.com/filecoin-project/lotus/metrics"
"github.com/filecoin-project/lotus/storage/sealer/sealtasks"
"github.com/filecoin-project/lotus/storage/sealer/storiface"
)
@ -51,6 +52,8 @@ type WorkerSelector interface {
}
type Scheduler struct {
mctx context.Context // metrics context
assigner Assigner
workersLk sync.RWMutex
@ -146,7 +149,7 @@ type rmRequest struct {
res chan error
}
func newScheduler(assigner string) (*Scheduler, error) {
func newScheduler(ctx context.Context, assigner string) (*Scheduler, error) {
var a Assigner
switch assigner {
case "", "utilization":
@ -158,6 +161,7 @@ func newScheduler(assigner string) (*Scheduler, error) {
}
return &Scheduler{
mctx: ctx,
assigner: a,
Workers: map[storiface.WorkerID]*WorkerHandle{},
@ -366,6 +370,9 @@ func (sh *Scheduler) trySched() {
sh.workersLk.RLock()
defer sh.workersLk.RUnlock()
done := metrics.Timer(sh.mctx, metrics.SchedAssignerCycleDuration)
defer done()
sh.assigner.TrySched(sh)
}

View File

@ -5,6 +5,10 @@ import (
"math/rand"
"sort"
"sync"
"go.opencensus.io/stats"
"github.com/filecoin-project/lotus/metrics"
)
type WindowSelector func(sh *Scheduler, queueLen int, acceptableWindows [][]int, windows []SchedWindow) int
@ -36,6 +40,9 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
windowsLen := len(sh.OpenWindows)
queueLen := sh.SchedQueue.Len()
stats.Record(sh.mctx, metrics.SchedCycleOpenWindows.M(int64(windowsLen)))
stats.Record(sh.mctx, metrics.SchedCycleQueueSize.M(int64(queueLen)))
log.Debugf("SCHED %d queued; %d open windows", queueLen, windowsLen)
if windowsLen == 0 || queueLen == 0 {
@ -52,6 +59,11 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
// Step 1
throttle := make(chan struct{}, windowsLen)
partDone := metrics.Timer(sh.mctx, metrics.SchedAssignerCandidatesDuration)
defer func() {
partDone()
}()
var wg sync.WaitGroup
wg.Add(queueLen)
for i := 0; i < queueLen; i++ {
@ -151,9 +163,14 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
log.Debugf("SCHED Acceptable win: %+v", acceptableWindows)
// Step 2
partDone()
partDone = metrics.Timer(sh.mctx, metrics.SchedAssignerWindowSelectionDuration)
scheduled := a.WindowSel(sh, queueLen, acceptableWindows, windows)
// Step 3
partDone()
partDone = metrics.Timer(sh.mctx, metrics.SchedAssignerSubmitDuration)
if scheduled == 0 {
return

View File

@ -226,7 +226,7 @@ func addTestWorker(t *testing.T, sched *Scheduler, index *paths.Index, name stri
}
func TestSchedStartStop(t *testing.T) {
sched, err := newScheduler("")
sched, err := newScheduler(context.Background(), "")
require.NoError(t, err)
go sched.runSched()
@ -356,7 +356,7 @@ func TestSched(t *testing.T) {
return func(t *testing.T) {
index := paths.NewIndex(nil)
sched, err := newScheduler("")
sched, err := newScheduler(ctx, "")
require.NoError(t, err)
sched.testSync = make(chan struct{})
@ -609,7 +609,7 @@ func BenchmarkTrySched(b *testing.B) {
for i := 0; i < b.N; i++ {
b.StopTimer()
sched, err := newScheduler("")
sched, err := newScheduler(ctx, "")
require.NoError(b, err)
sched.Workers[storiface.WorkerID{}] = &WorkerHandle{
workerRpc: nil,