fix: curio taskstorage: Don't try to free reservations by nulled TaskID (#12018)

This commit is contained in:
Łukasz Magiera 2024-05-20 13:04:37 +02:00 committed by GitHub
parent 1afe58dcad
commit db105f1cb6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 15 additions and 21 deletions

View File

@ -115,7 +115,7 @@ func (t *TaskStorage) HasCapacity() bool {
return false // no path found
}
func (t *TaskStorage) Claim(taskID int) error {
func (t *TaskStorage) Claim(taskID int) (func() error, error) {
// TaskStorage Claim Attempts to reserve storage for the task
// A: Create a reservation for files to be allocated
// B: Create a reservation for existing files to be fetched into local storage
@ -125,7 +125,7 @@ func (t *TaskStorage) Claim(taskID int) error {
sectorRef, err := t.taskToSectorRef(harmonytask.TaskID(taskID))
if err != nil {
return xerrors.Errorf("getting sector ref: %w", err)
return nil, xerrors.Errorf("getting sector ref: %w", err)
}
// storage writelock sector
@ -147,12 +147,12 @@ func (t *TaskStorage) Claim(taskID int) error {
if err := t.sc.sectors.sindex.StorageLock(lkctx, sectorRef.ID(), storiface.FTNone, requestedTypes); err != nil {
// timer will expire
return xerrors.Errorf("claim StorageLock: %w", err)
return nil, xerrors.Errorf("claim StorageLock: %w", err)
}
if !lockAcquireTimer.Stop() {
// timer expired, so lkctx is done, and that means the lock was acquired and dropped..
return xerrors.Errorf("failed to acquire lock")
return nil, xerrors.Errorf("failed to acquire lock")
}
defer func() {
// make sure we release the sector lock
@ -166,13 +166,13 @@ func (t *TaskStorage) Claim(taskID int) error {
// paths to be used.
pathsFs, pathIDs, err := t.sc.sectors.localStore.AcquireSector(ctx, sectorRef.Ref(), storiface.FTNone, requestedTypes, t.pathType, storiface.AcquireMove)
if err != nil {
return err
return nil, err
}
// reserve the space
release, err := t.sc.sectors.localStore.Reserve(ctx, sectorRef.Ref(), requestedTypes, pathIDs, storiface.FSOverheadSeal, t.MinFreeStoragePercentage)
if err != nil {
return err
return nil, err
}
var releaseOnce sync.Once
@ -197,19 +197,16 @@ func (t *TaskStorage) Claim(taskID int) error {
// note: we drop the sector writelock on return; THAT IS INTENTIONAL, this code runs in CanAccept, which doesn't
// guarantee that the work for this sector will happen on this node; SDR CanAccept just ensures that the node can
// run the job, harmonytask is what ensures that only one SDR runs at a time
return nil
return func() error {
return t.markComplete(taskID, sectorRef)
}, nil
}
func (t *TaskStorage) MarkComplete(taskID int) error {
func (t *TaskStorage) markComplete(taskID int, sectorRef SectorRef) error {
// MarkComplete is ALWAYS called after the task is done or not scheduled
// If Claim is called and returns without errors, MarkComplete with the same
// taskID is guaranteed to eventually be called
sectorRef, err := t.taskToSectorRef(harmonytask.TaskID(taskID))
if err != nil {
return xerrors.Errorf("getting sector ref: %w", err)
}
sres, ok := t.sc.sectors.storageReservations.Load(harmonytask.TaskID(taskID))
if !ok {
return xerrors.Errorf("no reservation found for task %d", taskID)

View File

@ -105,7 +105,8 @@ canAcceptAgain:
releaseStorage := func() {
}
if h.TaskTypeDetails.Cost.Storage != nil {
if err = h.TaskTypeDetails.Cost.Storage.Claim(int(*tID)); err != nil {
markComplete, err := h.TaskTypeDetails.Cost.Storage.Claim(int(*tID))
if err != nil {
log.Infow("did not accept task", "task_id", strconv.Itoa(int(*tID)), "reason", "storage claim failed", "name", h.Name, "error", err)
if len(ids) > 1 {
@ -122,7 +123,7 @@ canAcceptAgain:
return false
}
releaseStorage = func() {
if err := h.TaskTypeDetails.Cost.Storage.MarkComplete(int(*tID)); err != nil {
if err := markComplete(); err != nil {
log.Errorw("Could not release storage", "error", err)
}
}

View File

@ -31,12 +31,8 @@ type Resources struct {
type Storage interface {
HasCapacity() bool
// This allows some other system to claim space for this task.
Claim(taskID int) error
// This allows some other system to consider the task done.
// It's up to the caller to remove the data, if that applies.
MarkComplete(taskID int) error
// This allows some other system to claim space for this task. Returns a cleanup function
Claim(taskID int) (func() error, error)
}
type Reg struct {
Resources