lotus/htask.patch

1661 lines
63 KiB
Diff
Raw Normal View History

2023-08-14 16:40:12 +00:00
diff --git a/cmd/lotus-worker/main.go b/cmd/lotus-worker/main.go
index 944791275..995a3cbe0 100644
--- a/cmd/lotus-worker/main.go
+++ b/cmd/lotus-worker/main.go
@@ -609,6 +609,7 @@ var runCmd = &cli.Command{
if err := srv.Shutdown(context.TODO()); err != nil {
log.Errorf("shutting down RPC server failed: %s", err)
}
+ //taskManager.GracefullyTerminate(5*time.Hour)
log.Warn("Graceful shutdown successful")
}()
diff --git a/go.mod b/go.mod
index 2da784ad6..661495e89 100644
--- a/go.mod
+++ b/go.mod
@@ -156,7 +156,7 @@ require (
golang.org/x/exp v0.0.0-20230321023759-10a507213a29
golang.org/x/net v0.10.0
golang.org/x/sync v0.2.0
- golang.org/x/sys v0.9.0
+ golang.org/x/sys v0.10.0
golang.org/x/term v0.9.0
golang.org/x/time v0.0.0-20220722155302-e5dcc9cfc0b9
golang.org/x/tools v0.9.1
@@ -167,6 +167,8 @@ require (
require (
github.com/GeertJohan/go.incremental v1.0.0 // indirect
+ github.com/Inkeliz/go-opencl v0.0.0-20200806180703-5f0707fba006 // indirect
+ github.com/Nv7-Github/go-cl v0.0.0-20210426150049-f121093b60ef // indirect
github.com/PuerkitoBio/purell v1.1.1 // indirect
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
github.com/StackExchange/wmi v1.2.1 // indirect
@@ -177,8 +179,10 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/bep/debounce v1.2.1 // indirect
github.com/boltdb/bolt v1.3.1 // indirect
+ github.com/bytedance/sonic v1.9.1 // indirect
github.com/cespare/xxhash v1.1.0 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
+ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
github.com/cilium/ebpf v0.9.1 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/crackcomm/go-gitignore v0.0.0-20170627025303-887ab5e44cc3 // indirect
@@ -202,7 +206,10 @@ require (
github.com/flynn/noise v1.0.0 // indirect
github.com/francoispqt/gojay v1.2.13 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
+ github.com/gabriel-vasile/mimetype v1.4.2 // indirect
github.com/gdamore/encoding v1.0.0 // indirect
+ github.com/gin-contrib/sse v0.1.0 // indirect
+ github.com/gin-gonic/gin v1.9.1 // indirect
github.com/go-kit/log v0.2.1 // indirect
github.com/go-logfmt/logfmt v0.5.1 // indirect
github.com/go-logr/logr v1.2.4 // indirect
@@ -211,7 +218,11 @@ require (
github.com/go-openapi/jsonpointer v0.19.3 // indirect
github.com/go-openapi/jsonreference v0.19.4 // indirect
github.com/go-openapi/swag v0.19.11 // indirect
+ github.com/go-playground/locales v0.14.1 // indirect
+ github.com/go-playground/universal-translator v0.18.1 // indirect
+ github.com/go-playground/validator/v10 v10.14.0 // indirect
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
+ github.com/goccy/go-json v0.10.2 // indirect
github.com/godbus/dbus/v5 v5.1.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/glog v1.1.0 // indirect
@@ -256,10 +267,12 @@ require (
github.com/joeshaw/multierror v0.0.0-20140124173710-69b34d4ec901 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/jpillora/backoff v1.0.0 // indirect
+ github.com/json-iterator/go v1.1.12 // indirect
github.com/kilic/bls12-381 v0.1.0 // indirect
github.com/klauspost/compress v1.16.5 // indirect
github.com/klauspost/cpuid/v2 v2.2.5 // indirect
github.com/koron/go-ssdp v0.0.4 // indirect
+ github.com/leodido/go-urn v1.2.4 // indirect
github.com/libp2p/go-cidranger v1.1.0 // indirect
github.com/libp2p/go-flow-metrics v0.1.0 // indirect
github.com/libp2p/go-libp2p-asn-util v0.3.0 // indirect
@@ -280,6 +293,8 @@ require (
github.com/mikioh/tcpinfo v0.0.0-20190314235526-30a79bb1804b // indirect
github.com/mikioh/tcpopt v0.0.0-20190314235656-172688c1accc // indirect
github.com/minio/sha256-simd v1.0.1 // indirect
+ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+ github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/mr-tron/base58 v1.2.0 // indirect
github.com/multiformats/go-base36 v0.2.0 // indirect
github.com/multiformats/go-multiaddr-fmt v0.1.0 // indirect
@@ -291,6 +306,7 @@ require (
github.com/opencontainers/runtime-spec v1.0.2 // indirect
github.com/opentracing/opentracing-go v1.2.0 // indirect
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
+ github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
@@ -306,12 +322,15 @@ require (
github.com/rivo/uniseg v0.1.0 // indirect
github.com/rs/cors v1.7.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
+ github.com/samber/lo v1.38.1 // indirect
+ github.com/samuel/go-opencl v0.0.0-20171108220231-cbcfd10c32ad // indirect
github.com/shirou/gopsutil v2.18.12+incompatible // indirect
github.com/sirupsen/logrus v1.9.0 // indirect
github.com/spaolacci/murmur3 v1.1.0 // indirect
github.com/tidwall/gjson v1.14.4 // indirect
+ github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/twmb/murmur3 v1.1.6 // indirect
- github.com/ugorji/go/codec v1.2.6 // indirect
+ github.com/ugorji/go/codec v1.2.11 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.0.1 // indirect
github.com/whyrusleeping/cbor v0.0.0-20171005072247-63513f603b11 // indirect
@@ -327,6 +346,7 @@ require (
go.opentelemetry.io/otel/trace v1.16.0 // indirect
go.uber.org/dig v1.17.0 // indirect
go4.org v0.0.0-20230225012048-214862532bf5 // indirect
+ golang.org/x/arch v0.3.0 // indirect
golang.org/x/mod v0.10.0 // indirect
golang.org/x/text v0.10.0 // indirect
gonum.org/v1/gonum v0.13.0 // indirect
diff --git a/go.sum b/go.sum
index ebbc4dcc8..74127c535 100644
--- a/go.sum
+++ b/go.sum
@@ -59,6 +59,8 @@ github.com/GeertJohan/go.rice v1.0.3 h1:k5viR+xGtIhF61125vCE1cmJ5957RQGXG6dmbaWZ
github.com/GeertJohan/go.rice v1.0.3/go.mod h1:XVdrU4pW00M4ikZed5q56tPf1v2KwnIKeIdc9CBYNt4=
github.com/Gurpartap/async v0.0.0-20180927173644-4f7f499dd9ee h1:8doiS7ib3zi6/K172oDhSKU0dJ/miJramo9NITOMyZQ=
github.com/Gurpartap/async v0.0.0-20180927173644-4f7f499dd9ee/go.mod h1:W0GbEAA4uFNYOGG2cJpmFJ04E6SD1NLELPYZB57/7AY=
+github.com/Inkeliz/go-opencl v0.0.0-20200806180703-5f0707fba006 h1:TKWkFaRW5EPQyrS1pM0vm3vvqw/jmHu+FkV8gRD+7/w=
+github.com/Inkeliz/go-opencl v0.0.0-20200806180703-5f0707fba006/go.mod h1:9ILtD1/UTP/Y7JMCU8loWZMDvhrQuTgHzHatG6z9ZdQ=
github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0=
github.com/Kubuxu/go-os-helper v0.0.1/go.mod h1:N8B+I7vPCT80IcP58r50u4+gEEcsZETFUpAzWW2ep1Y=
github.com/Kubuxu/imtui v0.0.0-20210401140320-41663d68d0fa h1:1PPxEyGdIGVkX/kqMvLJ95a1dGS1Sz7tpNEgehEYYt0=
@@ -66,6 +68,8 @@ github.com/Kubuxu/imtui v0.0.0-20210401140320-41663d68d0fa/go.mod h1:WUmMvh9wMtq
github.com/Masterminds/glide v0.13.2/go.mod h1:STyF5vcenH/rUqTEv+/hBXlSTo7KYwg2oc2f4tzPWic=
github.com/Masterminds/semver v1.4.2/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
github.com/Masterminds/vcs v1.13.0/go.mod h1:N09YCmOQr6RLxC6UNHzuVwAdodYbbnycGHSmwVJjcKA=
+github.com/Nv7-Github/go-cl v0.0.0-20210426150049-f121093b60ef h1:DiNnYI6NBdeXGOJXptJcrYeDavJf4tImz/B4MOVQtMs=
+github.com/Nv7-Github/go-cl v0.0.0-20210426150049-f121093b60ef/go.mod h1:RRVtxaQlBBnbo+n2fgYHhxQmXDkRLKWcWX93lJL0Yhw=
github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI=
@@ -142,6 +146,9 @@ github.com/btcsuite/winsvc v1.0.0/go.mod h1:jsenWakMcC0zFBFurPLEAyrnc/teJEM1O46f
github.com/buger/goterm v1.0.3 h1:7V/HeAQHrzPk/U4BvyH2g9u+xbUW9nr4yRPyG59W4fM=
github.com/buger/goterm v1.0.3/go.mod h1:HiFWV3xnkolgrBV3mY8m0X0Pumt4zg4QhbdOzQtB8tE=
github.com/buger/jsonparser v0.0.0-20181115193947-bf1c66bbce23/go.mod h1:bbYlZJ7hK1yFx9hf58LP0zeX7UjIGs20ufpu3evjr+s=
+github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
+github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
+github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
github.com/casbin/casbin/v2 v2.1.2/go.mod h1:YcPU1XXisHhLzuxH9coDNf2FbKpjGlbCg3n9yuLkIJQ=
github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
@@ -152,6 +159,9 @@ github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cheekybits/genny v1.0.0/go.mod h1:+tQajlRqAUrPI7DOSpB0XAqZYtQakVtB7wXkRAgjxjQ=
+github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
+github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
+github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
@@ -386,6 +396,8 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
+github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
+github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/gbrlsnchs/jwt/v3 v3.0.1 h1:lbUmgAKpxnClrKloyIwpxm4OuWeDl5wLk52G91ODPw4=
github.com/gbrlsnchs/jwt/v3 v3.0.1/go.mod h1:AncDcjXz18xetI3A6STfXq2w+LuTx8pQ8bGEwRN8zVM=
github.com/gdamore/encoding v1.0.0 h1:+7OoQ1Bc6eTm5niUzBa0Ctsh6JbMW6Ra+YNuAtDBdko=
@@ -399,6 +411,8 @@ github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
github.com/gin-gonic/gin v1.6.3 h1:ahKqKTFpO5KTPHxWZjEdPScmYaGtLo8Y4DMHoEsnp14=
github.com/gin-gonic/gin v1.6.3/go.mod h1:75u5sXoLsGZoRN5Sgbi1eraJ4GU3++wFwWzhwvtwp4M=
+github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
+github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
github.com/gliderlabs/ssh v0.1.1/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0=
github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98=
github.com/go-chi/chi v1.5.4 h1:QHdzF2szwjqVV4wmByUnTcsbIg7UGaQ0tPF2t5GcAIs=
@@ -445,10 +459,16 @@ github.com/go-openapi/swag v0.19.11/go.mod h1:Uc0gKkdR+ojzsEpjh39QChyu92vPgIr72P
github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q=
github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8=
+github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
+github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no=
github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA=
+github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
+github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
github.com/go-playground/validator/v10 v10.2.0 h1:KgJ0snyC2R9VXYN2rneOtQcw5aHQB1Vv0sFl1UcHBOY=
github.com/go-playground/validator/v10 v10.2.0/go.mod h1:uOYAAleCW8F/7oMFd6aG0GOhaH6EGOAJShg8Id5JGkI=
+github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
+github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
@@ -464,6 +484,8 @@ github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo=
github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
+github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
+github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk=
@@ -970,6 +992,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y=
github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII=
+github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
+github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
github.com/lib/pq v1.10.0 h1:Zx5DJFEYQXio93kgXnQ09fXNiUKsqv4OUEu2UtGcB1E=
github.com/libp2p/go-addr-util v0.0.1/go.mod h1:4ac6O7n9rIAKB1dnd+s8IbbMXkt+oBpzX4/+RACcnlQ=
github.com/libp2p/go-addr-util v0.0.2/go.mod h1:Ecd6Fb3yIuLzq4bD7VcywcVSBtefcAwnUISBM3WG15E=
@@ -1405,7 +1429,10 @@ github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144T
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k=
+github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
+github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
+github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
github.com/performancecopilot/speed v3.0.0+incompatible/go.mod h1:/CLtqpZ5gBg1M9iaPbIdPPGyKcA8hKdoy6hAWba7Yac=
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 h1:1/WtZae0yGtPq+TI6+Tv1WTxkukpXeMlviSxvL7SRgk=
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9/go.mod h1:x3N5drFsm2uilKKuuYo6LdyD8vZAW55sH/9w+pbo1sw=
@@ -1512,6 +1539,10 @@ github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk=
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
+github.com/samber/lo v1.38.1 h1:j2XEAqXKb09Am4ebOg31SpvzUTTs6EN3VfgeLUhPdXM=
+github.com/samber/lo v1.38.1/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA=
+github.com/samuel/go-opencl v0.0.0-20171108220231-cbcfd10c32ad h1:zyvTnsJPPAqVg2v3bbvTI+RdbVPJufZ+CWCPOX0Dtp8=
+github.com/samuel/go-opencl v0.0.0-20171108220231-cbcfd10c32ad/go.mod h1:KCqoxhWgoxCWg13iOq53YFf50jlonuuhIpO916aWEkg=
github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E=
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/sercand/kuberesolver v2.4.0+incompatible h1:WE2OlRf6wjLxHwNkkFLQGaZcVLEXjMjBPjjEU5vksH8=
@@ -1598,6 +1629,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stvp/go-udp-testing v0.0.0-20201019212854-469649b16807/go.mod h1:7jxmlfBCDBXRzr0eAQJ48XC1hBu1np4CS5+cHEYfwpc=
@@ -1618,6 +1651,8 @@ github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
+github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
+github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/twmb/murmur3 v1.1.6 h1:mqrRot1BRxm+Yct+vavLMou2/iJt0tNVTTC0QoIjaZg=
github.com/twmb/murmur3 v1.1.6/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ=
github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o=
@@ -1628,6 +1663,8 @@ github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljT
github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
github.com/ugorji/go/codec v1.2.6 h1:7kbGefxLoDBuYXOms4yD7223OpNMMPNPZxXk5TvFcyQ=
github.com/ugorji/go/codec v1.2.6/go.mod h1:V6TCNZ4PHqoHGFZuSG1W8nrCzzdgA2DozYxWFFpvxTw=
+github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
+github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
@@ -1789,6 +1826,9 @@ go4.org v0.0.0-20180809161055-417644f6feb5/go.mod h1:MkTOUMDaeVYJUOUsaDXIhWPZYa1
go4.org v0.0.0-20200411211856-f5505b9728dd/go.mod h1:CIiUVy99QCPfoE13bO4EZaz5GZMZXMSBGhxRdsvzbkg=
go4.org v0.0.0-20230225012048-214862532bf5 h1:nifaUDeh+rPaBCMPMQHZmvJf+QdpLFnuQPwx+LxVmtc=
go4.org v0.0.0-20230225012048-214862532bf5/go.mod h1:F57wTi5Lrj6WLyswp5EYV1ncrEbFGHD4hhz6S1ZYeaU=
+golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
+golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
+golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/build v0.0.0-20190111050920-041ab4dc3f9d/go.mod h1:OWs+y06UdEOHN4y+MfF/py+xQ/tYqIWW03b70/CG9Rw=
golang.org/x/crypto v0.0.0-20170930174604-9419663f5a44/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
@@ -2066,6 +2106,8 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA=
+golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20201210144234-2321bbc49cbf/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
@@ -2319,6 +2361,7 @@ lukechampine.com/blake3 v1.2.1/go.mod h1:0OFRp7fBtAylGVCO40o87sbupkyIGgbpv1+M1k1
nhooyr.io/websocket v1.8.7 h1:usjR2uOr/zjjkVMy0lW+PPohFok7PCow5sDjLgX4P4g=
nhooyr.io/websocket v1.8.7/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0=
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
+rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=
sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o=
diff --git a/itests/harmonytask_test.go b/itests/harmonytask_test.go
new file mode 100644
index 000000000..2c8523d82
--- /dev/null
+++ b/itests/harmonytask_test.go
@@ -0,0 +1,247 @@
+package itests
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "sort"
+ "strings"
+ "sync"
+ "testing"
+ "time"
+
+ "github.com/filecoin-project/lotus/itests/kit"
+ "github.com/filecoin-project/lotus/lib/harmony/harmonydb"
+ "github.com/filecoin-project/lotus/lib/harmony/harmonytask"
+ "github.com/filecoin-project/lotus/lib/harmony/resources"
+ "github.com/filecoin-project/lotus/node/impl"
+ "github.com/stretchr/testify/require"
+)
+
+type task1 struct {
+ toAdd []int // values that Adder turns into tasks, one task per element
+ myPersonalTableLock sync.Mutex // held by Do and by Adder's callback while they touch the fields below
+ myPersonalTable map[harmonytask.TaskID]int // This would typically be a DB table
+ WorkCompleted []string // one "taskResult<N>" entry appended by each Do call that gets past the ownership check
+}
+
+func (t *task1) Do(tID harmonytask.TaskID, stillOwned func() bool) (done bool, err error) {
+ if !stillOwned() { // bail out with an error when the ownership callback reports false
+ return false, errors.New("Why not still owned?")
+ }
+ t.myPersonalTableLock.Lock()
+ defer t.myPersonalTableLock.Unlock()
+ t.WorkCompleted = append(t.WorkCompleted, fmt.Sprintf("taskResult%d", t.myPersonalTable[tID])) // record the value Adder stored for tID (0 if tID is unknown)
+ return true, nil
+}
+func (t *task1) CanAccept(list []harmonytask.TaskID) (*harmonytask.TaskID, error) {
+ return &list[0], nil // always picks the first offered ID; indexing panics if list is empty
+}
+func (t *task1) TypeDetails() harmonytask.TaskTypeDetails { // constant description of this task type
+ return harmonytask.TaskTypeDetails{
+ Max: 100,
+ Name: "ThingOne", // NOTE(review): 8 characters; presumably stored in harmony_task.name, which is varchar(8) — confirm before lengthening
+ MaxFailures: 1,
+ Cost: resources.Resources{
+ Cpu: 1,
+ Ram: 100 << 10, // at 100kb, it's tiny
+ },
+ }
+}
+func (t *task1) Adder(add harmonytask.AddTaskFunc) { // registers one task per element of toAdd
+ for _, v := range t.toAdd {
+ v := v // per-iteration copy: before Go 1.22 the closure below would otherwise share one v across iterations
+ add(func(tID harmonytask.TaskID, tx *harmonydb.Tx) bool {
+ t.myPersonalTableLock.Lock()
+ defer t.myPersonalTableLock.Unlock()
+ t.myPersonalTable[tID] = v // remember which value belongs to the newly assigned task ID
+ return true
+ })
+ }
+}
+
+func TestHarmonyTasks(t *testing.T) { // runs one engine with task1 and checks both queued values were processed
+ withSetup(t, func(m *kit.TestMiner) {
+ cdb := m.BaseAPI.(*impl.StorageMinerAPI).HarmonyDB
+ t1 := &task1{
+ toAdd: []int{56, 73},
+ myPersonalTable: map[harmonytask.TaskID]int{},
+ }
+ e, err := harmonytask.New(cdb, []harmonytask.TaskInterface{t1}, "test:1")
+ require.NoError(t, err)
+ time.Sleep(3 * time.Second) // do the work. FLAKYNESS RISK HERE.
+ e.GracefullyTerminate(time.Minute)
+ require.Len(t, t1.WorkCompleted, 2, "wrong amount of work complete") // Len, not Equal: a slice never equals the int 2
+ sort.Strings(t1.WorkCompleted) // completion order is not fixed, so compare in sorted order
+ got := strings.Join(t1.WorkCompleted, ",")
+ expected := "taskResult56,taskResult73"
+ if got != expected {
+ t.Fatal("Unexpected results! Wanted " + expected + " got " + got)
+ }
+ // TODO test history table looks right.
+ })
+}
+
+type passthru struct { // test double: every interface method forwards to the matching field
+ dtl harmonytask.TaskTypeDetails // returned verbatim by TypeDetails
+ do func(tID harmonytask.TaskID, stillOwned func() bool) (done bool, err error) // backs Do
+ canAccept func(list []harmonytask.TaskID) (*harmonytask.TaskID, error) // backs CanAccept
+ adder func(add harmonytask.AddTaskFunc) // backs Adder; may be left nil
+}
+
+func (t *passthru) Do(tID harmonytask.TaskID, stillOwned func() bool) (done bool, err error) {
+ return t.do(tID, stillOwned) // no nil check: panics if the do field was left unset
+}
+func (t *passthru) CanAccept(list []harmonytask.TaskID) (*harmonytask.TaskID, error) {
+ return t.canAccept(list) // no nil check: panics if the canAccept field was left unset
+}
+func (t *passthru) TypeDetails() harmonytask.TaskTypeDetails {
+ return t.dtl
+}
+func (t *passthru) Adder(add harmonytask.AddTaskFunc) {
+ if t.adder != nil { // a passthru without an adder simply adds nothing
+ t.adder(add)
+ }
+}
+
+// Common stuff shared by the passthru-based tests below.
+var dtl = harmonytask.TaskTypeDetails{Name: "foo", Max: -1, Cost: resources.Resources{}} // NOTE(review): Max: -1 presumably means "no limit" — confirm in harmonytask
+var letters []string // results appended by the worker callbacks; package-level, so it is shared by every test in this file
+var lettersMutex sync.Mutex // held by the worker callbacks while they append to letters
+
+func fooLetterAdder(t *testing.T, cdb *harmonydb.DB) *passthru { // builds the "sender" party; cdb is not used in this body
+ return &passthru{
+ dtl: dtl,
+ canAccept: func(list []harmonytask.TaskID) (*harmonytask.TaskID, error) { return nil, nil }, // returns no task ID, so presumably this party never takes work — confirm
+ adder: func(add harmonytask.AddTaskFunc) {
+ for _, v := range []string{"A", "B"} { // NOTE(review): v is captured by the closure below; before Go 1.22 that is only safe if add runs the callback before the next iteration — confirm
+ add(func(tID harmonytask.TaskID, tx *harmonydb.Tx) bool {
+ _, err := tx.Exec("INSERT INTO itest_scratch (some_int, content) VALUES ($1,$2)", tID, v) // store the letter keyed by the new task ID
+ require.NoError(t, err)
+ return true
+ })
+ }
+ },
+ }
+}
+func fooLetterSaver(t *testing.T, cdb *harmonydb.DB) *passthru { // builds the "worker" party that copies stored letters into the letters slice
+ return &passthru{
+ dtl: dtl,
+ canAccept: func(list []harmonytask.TaskID) (*harmonytask.TaskID, error) { return &list[0], nil }, // always picks the first offered ID; panics on an empty list
+ do: func(tID harmonytask.TaskID, stillOwned func() bool) (done bool, err error) {
+ var content string
+ err = cdb.QueryRow(context.Background(), // look up the letter stored under this task ID
+ "SELECT content FROM itest_scratch WHERE some_int=$1", tID).Scan(&content)
+ require.NoError(t, err)
+ lettersMutex.Lock()
+ defer lettersMutex.Unlock()
+ letters = append(letters, content) // publish the result to the package-level slice
+ return true, nil
+ },
+ }
+}
+
+func TestHarmonyTasksWith2PartiesPolling(t *testing.T) { // one engine adds tasks, a second engine picks them up by polling
+ withSetup(t, func(m *kit.TestMiner) {
+ cdb := m.BaseAPI.(*impl.StorageMinerAPI).HarmonyDB
+ senderParty := fooLetterAdder(t, cdb)
+ workerParty := fooLetterSaver(t, cdb)
+ harmonytask.POLL_DURATION = time.Millisecond * 100 // shorten the poll interval so the 3s sleep below is enough
+ sender, err := harmonytask.New(cdb, []harmonytask.TaskInterface{senderParty}, "test:1")
+ require.NoError(t, err)
+ worker, err := harmonytask.New(cdb, []harmonytask.TaskInterface{workerParty}, "test:2")
+ require.NoError(t, err)
+ time.Sleep(3 * time.Second) // do the work. FLAKYNESS RISK HERE.
+ sender.GracefullyTerminate(time.Second * 5)
+ worker.GracefullyTerminate(time.Second * 5)
+ sort.Strings(letters) // completion order is not fixed, so compare in sorted order
+ require.Equal(t, []string{"A", "B"}, letters) // expected first, actual second, as in the other tests
+ })
+}
+
+func TestWorkStealing(t *testing.T) { // a live worker must take over a task owned by a long-silent machine
+ withSetup(t, func(m *kit.TestMiner) {
+ cdb := m.BaseAPI.(*impl.StorageMinerAPI).HarmonyDB
+ ctx := context.Background()
+ letters = nil // package-level slice: drop results left behind by earlier tests
+ // The dead worker will be played by a few SQL INSERTS.
+ _, err := cdb.Exec(ctx, `INSERT INTO harmony_machines
+ (id, last_contact,host_and_port, cpu, ram, gpu, gpuram)
+ VALUES (300, DATE '2000-01-01', 'test:1', 4, 400000, 1, 1000000)`)
+ require.NoError(t, err)
+ _, err = cdb.Exec(ctx, `INSERT INTO harmony_task
+ (id, name, owner_id, posted_time, added_by)
+ VALUES (1234, 'foo', 300, DATE '2000-01-01', 300)`)
+ require.NoError(t, err)
+ _, err = cdb.Exec(ctx, "INSERT INTO itest_scratch (some_int, content) VALUES (1234, 'M')")
+ require.NoError(t, err)
+
+ harmonytask.POLL_DURATION = time.Millisecond * 100
+ harmonytask.CLEANUP_FREQUENCY = time.Millisecond * 100 // shortened together with the poll interval so the 3s sleep below is enough
+ worker, err := harmonytask.New(cdb, []harmonytask.TaskInterface{fooLetterSaver(t, cdb)}, "test:2")
+ require.NoError(t, err)
+ time.Sleep(3 * time.Second) // do the work. FLAKYNESS RISK HERE.
+ worker.GracefullyTerminate(time.Second * 5)
+ require.Equal(t, []string{"M"}, letters)
+ })
+}
+
+func TestTaskRetry(t *testing.T) { // each task fails once, must be retried, and both attempts must show up in the history
+ withSetup(t, func(m *kit.TestMiner) {
+ cdb := m.BaseAPI.(*impl.StorageMinerAPI).HarmonyDB
+ senderParty := fooLetterAdder(t, cdb)
+ harmonytask.POLL_DURATION = time.Millisecond * 100
+ sender, err := harmonytask.New(cdb, []harmonytask.TaskInterface{senderParty}, "test:1")
+ require.NoError(t, err)
+ letters = nil // package-level slice: drop results left behind by earlier tests (nothing appends until rcv starts)
+ alreadyFailed := map[string]bool{} // letters whose first attempt has already been failed; guarded by lettersMutex
+ fails2xPerMsg := &passthru{
+ dtl: dtl,
+ canAccept: func(list []harmonytask.TaskID) (*harmonytask.TaskID, error) { return &list[0], nil },
+ do: func(tID harmonytask.TaskID, stillOwned func() bool) (done bool, err error) {
+ var content string
+ err = cdb.QueryRow(context.Background(),
+ "SELECT content FROM itest_scratch WHERE some_int=$1", tID).Scan(&content)
+ require.NoError(t, err)
+ lettersMutex.Lock()
+ defer lettersMutex.Unlock()
+ if !alreadyFailed[content] { // first attempt for this letter: fail on purpose
+ alreadyFailed[content] = true
+ return false, errors.New("intentional 'error'")
+ }
+ letters = append(letters, content) // later attempt: succeed and record the letter
+ return true, nil
+ },
+ }
+ rcv, err := harmonytask.New(cdb, []harmonytask.TaskInterface{fails2xPerMsg}, "test:2")
+ require.NoError(t, err)
+ time.Sleep(3 * time.Second)
+ sender.GracefullyTerminate(time.Hour)
+ rcv.GracefullyTerminate(time.Hour)
+ sort.Strings(letters)
+ require.Equal(t, []string{"A", "B"}, letters)
+ type hist struct {
+ TaskID int
+ Result bool
+ Err string
+ }
+ var res []hist
+ require.NoError(t, cdb.Select(context.Background(), &res,
+ `SELECT task_id, result, err FROM harmony_task_history
+ ORDER BY result DESC, task_id`))
+
+ require.Equal(t, []hist{ // successes sort first (result DESC), then the one failure per task
+ {1, true, ""},
+ {2, true, ""},
+ {1, false, "error: intentional 'error'"},
+ {2, false, "error: intentional 'error'"}}, res)
+ })
+}
+
+/*
+FUTURE: test fast-pass round-robin via HTTP calls (3-party) once the API for that is set.
+It's necessary for WinningPoSt.
+
+FUTURE: test follows.
+They are necessary for sealing work.
+*/
diff --git a/lib/harmony/harmonydb/harmonydb.go b/lib/harmony/harmonydb/harmonydb.go
index fd31e7a13..48e3db6fa 100644
--- a/lib/harmony/harmonydb/harmonydb.go
+++ b/lib/harmony/harmonydb/harmonydb.go
@@ -118,21 +118,25 @@ type tracer struct {
type ctxkey string
-var sqlStart = ctxkey("sqlStart")
+const SQL_START = ctxkey("sqlStart")
+const SQL_STRING = ctxkey("sqlString")
func (t tracer) TraceQueryStart(ctx context.Context, conn *pgx.Conn, data pgx.TraceQueryStartData) context.Context {
- return context.WithValue(ctx, sqlStart, time.Now())
+ return context.WithValue(context.WithValue(ctx, SQL_START, time.Now()), SQL_STRING, data.SQL)
}
func (t tracer) TraceQueryEnd(ctx context.Context, conn *pgx.Conn, data pgx.TraceQueryEndData) {
DBMeasures.Hits.M(1)
- ms := time.Since(ctx.Value(sqlStart).(time.Time)).Milliseconds()
+ ms := time.Since(ctx.Value(SQL_START).(time.Time)).Milliseconds()
DBMeasures.TotalWait.M(ms)
DBMeasures.Waits.Observe(float64(ms))
if data.Err != nil {
DBMeasures.Errors.M(1)
}
- // Can log what type of query it is, but not what tables
- // Can log rows affected.
+ logger.Debugw("SQL run",
+ "query", ctx.Value(SQL_STRING).(string),
+ "err", data.Err,
+ "rowCt", data.CommandTag.RowsAffected(),
+ "milliseconds", ms)
}
// addStatsAndConnect connects a prometheus logger. Be sure to run this before using the DB.
@@ -250,8 +254,9 @@ func (db *DB) upgrade() error {
}
_, err = db.pgx.Exec(context.Background(), s)
if err != nil {
- db.log(fmt.Sprintf("Could not upgrade! File %s, Query: %s, Returned: %s", name, s, err.Error()))
- return err
+ msg := fmt.Sprintf("Could not upgrade! File %s, Query: %s, Returned: %s", name, s, err.Error())
+ db.log(msg)
+ return errors.New(msg) // makes devs lives easier by placing message at the end.
}
}
diff --git a/lib/harmony/harmonydb/sql/20230706.sql b/lib/harmony/harmonydb/sql/20230706.sql
index b45aca7fa..a4a333b81 100644
--- a/lib/harmony/harmonydb/sql/20230706.sql
+++ b/lib/harmony/harmonydb/sql/20230706.sql
@@ -2,5 +2,6 @@ CREATE TABLE itest_scratch (
id SERIAL PRIMARY KEY,
content TEXT,
some_int INTEGER,
+ second_int INTEGER,
update_time TIMESTAMP DEFAULT current_timestamp
)
\ No newline at end of file
diff --git a/lib/harmony/harmonydb/sql/20230719.sql b/lib/harmony/harmonydb/sql/20230719.sql
new file mode 100644
index 000000000..0a676526b
--- /dev/null
+++ b/lib/harmony/harmonydb/sql/20230719.sql
@@ -0,0 +1,52 @@
+/* For HarmonyTask base implementation. */
+
+CREATE TABLE harmony_machines (
+ id SERIAL PRIMARY KEY NOT NULL,
+ last_contact TIMESTAMP NOT NULL DEFAULT current_timestamp,
+ host_and_port varchar(300) NOT NULL,
+ cpu INTEGER NOT NULL,
+ ram BIGINT NOT NULL,
+ gpu FLOAT NOT NULL,
+ gpuram BIGINT NOT NULL
+);
+
+CREATE TABLE harmony_task (
+ id SERIAL PRIMARY KEY NOT NULL,
+ initiated_by INTEGER,
+ update_time TIMESTAMP NOT NULL DEFAULT current_timestamp,
+ posted_time TIMESTAMP NOT NULL,
+ owner_id INTEGER REFERENCES harmony_machines (id) ON DELETE SET NULL,
+ added_by INTEGER NOT NULL,
+ previous_task INTEGER,
+ name varchar(8) NOT NULL
+);
+COMMENT ON COLUMN harmony_task.initiated_by IS 'The task ID whose completion occasioned this task.';
+COMMENT ON COLUMN harmony_task.owner_id IS 'The foreign key to harmony_machines. May be null if between owners or not yet taken.';
+COMMENT ON COLUMN harmony_task.name IS 'The name of the task type.';
+COMMENT ON COLUMN harmony_task.added_by IS 'The harmony_machines id of the machine that added this task.';
+COMMENT ON COLUMN harmony_task.update_time IS 'When it was last modified. not a heartbeat';
+
+CREATE TABLE harmony_task_history (
+ id SERIAL PRIMARY KEY NOT NULL,
+ task_id INTEGER NOT NULL,
+ name VARCHAR(8) NOT NULL,
+ posted TIMESTAMP NOT NULL,
+ work_start TIMESTAMP NOT NULL,
+ work_end TIMESTAMP NOT NULL,
+ result BOOLEAN NOT NULL,
+ err varchar
+);
+COMMENT ON COLUMN harmony_task_history.result IS 'Use to determine if this was a successful run.';
+
+CREATE TABLE harmony_task_follow (
+ id SERIAL PRIMARY KEY NOT NULL,
+ owner_id INTEGER NOT NULL REFERENCES harmony_machines (id) ON DELETE CASCADE,
+ to_type VARCHAR(8) NOT NULL,
+ from_type VARCHAR(8) NOT NULL
+);
+
+CREATE TABLE harmony_task_impl (
+ id SERIAL PRIMARY KEY NOT NULL,
+ owner_id INTEGER NOT NULL REFERENCES harmony_machines (id) ON DELETE CASCADE,
+ name VARCHAR(8) NOT NULL
+);
\ No newline at end of file
diff --git a/lib/harmony/harmonytask/doc.go b/lib/harmony/harmonytask/doc.go
new file mode 100644
index 000000000..357c3e15c
--- /dev/null
+++ b/lib/harmony/harmonytask/doc.go
@@ -0,0 +1,79 @@
+/*
+ Package harmonytask implements a pure (no task logic), distributed
+ task manager. This clean interface allows a task implementer to completely
+
+avoid being concerned with task scheduling and management.
+It's based on the idea of tasks as small units of work broken from other
+work by hardware, parallelizability, reliability, or any other reason.
+Workers will be Greedy: vacuuming up their favorite jobs from a list.
+Once 1 task is accepted, harmonytask tries to get other task runner
+machines to accept work (round robin) before trying again to accept.
+*
+Mental Model:
+
+ Things that block tasks:
+ - task not registered for any running server
+ - max was specified and reached
+ - resource exhaustion
+ - CanAccept() interface (per-task implementation) does not accept it.
+ Ways tasks start: (slowest first)
+ - DB Read every 1 minute
+ - Bump via HTTP if registered in DB
+ - Task was added (to db) by this process
+ Ways tasks get added:
+ - Async Listener task (for chain, etc)
+ - Followers: Tasks get added because another task completed
+ When Follower collectors run:
+ - If both sides are process-local, then immediately when the task completes
+ - Otherwise, at the listen interval during db scrape
+ How duplicate tasks are avoided:
+ - that's up to the task definition, but probably a unique key
+
+*
+To use:
+1. Implement TaskInterface for a new task.
+2. Have New() receive this & all other ACTIVE implementations.
+*
+*
+As we are not expecting DBAs in this database, it's important to know
+what grows uncontrolled. The only harmony_* tables that grow are
+harmony_task_history (somewhat quickly) and harmony_machines (slowly). These
+will need a clean-up once the task data could never be acted upon.
+But the design **requires** extraInfo tables to grow until the task's
+info could not possibly be used by a following task, including slow
+release rollout. This would normally be in the order of months old.
+*
+Other possible enhancements include more collaborative coordination
+to assign a task to machines closer to the data.
+
+__Database_Behavior__
+harmony_task is the list of work that has not been completed.
+
+ AddTaskFunc manages the additions, but is designed to have its
+ transactions failed-out on overlap with a similar task already written.
+ It's up to the TaskInterface implementer to discover this overlap via
+ some other table it uses (since overlap can mean very different things).
+
+harmony_task_history
+
+ This holds transactions that completed or saw too many retries. It also
+ serves as input for subsequent (follower) tasks to kick off. This is not
+ done machine-internally because a follower may not be on the same machine
+ as the previous task.
+
+harmony_machines
+
+ Managed by lib/harmony/resources, this is a reference to machines registered
+ via the resources. This registration does not obligate the machine to
+ anything, but serves as a discovery mechanism. Paths are hostnames + ports
+ which are presumed to support http, but this assumption is only used by
+ the task system.
+
+harmony_task_follow / harmony_task_impl
+
+ These tables are used to fast-path notifications to other machines instead
+ of waiting for polling. _impl helps round-robin work pick-up. _follow helps
+ discover the machines that are interested in creating tasks following the
+ task that just completed.
+*/
+package harmonytask
diff --git a/lib/harmony/harmonytask/harmonytask.go b/lib/harmony/harmonytask/harmonytask.go
new file mode 100644
index 000000000..1f5662959
--- /dev/null
+++ b/lib/harmony/harmonytask/harmonytask.go
@@ -0,0 +1,386 @@
+package harmonytask
+
+import (
+ "context"
+ "fmt"
+ "strconv"
+ "sync/atomic"
+ "time"
+
+ "github.com/filecoin-project/lotus/lib/harmony/resources"
+ "github.com/gin-gonic/gin"
+
+ "github.com/filecoin-project/lotus/lib/harmony/harmonydb"
+)
+
+// Consts (except for unit test)
+var POLL_DURATION = time.Minute // Poll for Work this frequently
+var CLEANUP_FREQUENCY = 5 * time.Minute // Check for dead workers this often * everyone
+
+type TaskTypeDetails struct {
+ // Max is how many tasks of this type this machine may run at once.
+ // Negative means unrestricted.
+ Max int
+
+ // Name is the task name; the harmony_* tables store it as varchar(8), so keep it to 8 characters.
+ Name string
+
+ // Peak costs to Do() the task.
+ Cost resources.Resources
+
+ // Max Failure count before the job is dropped.
+ // 0 = retry forever
+ MaxFailures uint
+
+ // Follows is keyed by the name of the task type being followed. When such a
+ // task completes, the function is called with that task's TaskID and should
+ // populate extraInfo from data available in the previous task's tables.
+ // It should also return success if the trigger succeeded.
+ // NOTE: if refactoring tasks, see if your task is
+ // necessary. Ex: Is the sector state correct for your stage to run?
+ Follows map[string]func(TaskID, AddTaskFunc) bool
+}
+
+// TaskInterface must be implemented in order to have a task used by harmonytask.
+type TaskInterface interface {
+ // Do the task assigned. Call stillOwned before making single-writer-only
+ // changes to ensure the work has not been stolen.
+ // This is the ONLY function that should attempt to do the work, and must
+ // ONLY be called by harmonytask.
+ // Indicate if the task no-longer needs scheduling with done=true including
+ // cases where it's past the deadline.
+ Do(taskID TaskID, stillOwned func() bool) (done bool, err error)
+
+ // CanAccept should return the task it is willing to run on this machine. It
+ // should return nil if none of the offered tasks is allowed on this machine.
+ // It should select the task it most wants to accomplish.
+ // It is also responsible for determining disk space (including scratch).
+ CanAccept([]TaskID) (*TaskID, error)
+
+ // TypeDetails() returns static details about how this task behaves and
+ // how this machine will run it. It is only read while New() sets up the engine.
+ TypeDetails() TaskTypeDetails
+
+ // This listener will consume all external sources continuously for work.
+ // Do() may also be called from a backlog of work. This must not
+ // start doing the work (it still must be scheduled).
+ // Note: Task de-duplication should happen in the extraInfo function by
+ // returning false, typically by determining from the tx that the work
+ // exists already. The easy way is to have a unique joint index
+ // across all fields that will be common.
+ // Adder should typically only add its own task type, but multiple
+ // is possible for when 1 trigger starts 2 things.
+ // Usage Example:
+ // func (b *BazType)Adder(addTask AddTaskFunc) {
+ // for {
+ // bazMaker := <- bazChannel
+ // addTask(func(id harmonytask.TaskID, tx *harmonydb.Tx) bool {
+ // _, err := tx.Exec(`INSERT INTO bazInfoTable (taskID, qix, mot)
+ // VALUES ($1,$2,$3)`, id, bazMaker.qix, bazMaker.mot)
+ // if err != nil {
+ // scream(err)
+ // return false
+ // }
+ // return true
+ // })
+ // }
+ // }
+ Adder(AddTaskFunc)
+}
+
+type AddTaskFunc func(extraInfo func(TaskID, *harmonydb.Tx) bool)
+
+type TaskEngine struct {
+ ctx context.Context // cancelled by grace; used for the engine's DB calls
+ handlers []*taskTypeHandler // one per TaskInterface passed to New
+ db *harmonydb.DB
+ workAdderMutex *notifyingMx // held by considerWork while it checks capacity and claims a task
+ reg *resources.Reg
+ grace context.CancelFunc // cancels ctx; called first by GracefullyTerminate
+ taskMap map[string]*taskTypeHandler // handlers indexed by task name
+ ownerID int // this machine's id in harmony_machines
+ tryAllWork chan bool // notify if work completed
+ follows map[string][]followStruct // keyed by the name of the task being followed
+ lastFollowTime time.Time // lower bound for the next followWorkInDB history scan
+ lastCleanup atomic.Value // time.Time of the last resources.CleanupMachines run
+}
+type followStruct struct {
+ f func(TaskID, AddTaskFunc) bool
+ h *taskTypeHandler
+}
+
+type TaskID int
+
+// New registers this machine and every implementation in impls with the DB,
+// re-offers tasks the DB still lists under this machine's owner ID, then starts
+// each Adder and the poller. TaskEngine itself knows nothing about task logic.
+func New(
+	db *harmonydb.DB,
+	impls []TaskInterface,
+	hostnameAndPort string) (*TaskEngine, error) {
+
+	reg, err := resources.Register(db, hostnameAndPort)
+	if err != nil {
+		return nil, fmt.Errorf("cannot get resources: %w", err)
+	}
+	ctx, grace := context.WithCancel(context.Background())
+	e := &TaskEngine{
+		ctx:            ctx,
+		grace:          grace,
+		db:             db,
+		reg:            reg,
+		ownerID:        reg.Resources.MachineID, // The current number representing "hostAndPort"
+		workAdderMutex: &notifyingMx{},
+		taskMap:        make(map[string]*taskTypeHandler, len(impls)),
+		tryAllWork:     make(chan bool),
+		follows:        make(map[string][]followStruct),
+	}
+	e.lastCleanup.Store(time.Now())
+	for _, c := range impls {
+		h := taskTypeHandler{
+			TaskInterface:   c,
+			TaskTypeDetails: c.TypeDetails(),
+			TaskEngine:      e,
+		}
+		e.handlers = append(e.handlers, &h)
+		e.taskMap[h.TaskTypeDetails.Name] = &h
+
+		_, err := db.Exec(e.ctx, `INSERT INTO harmony_task_impl (owner_id, name)
+			VALUES ($1,$2)`, e.ownerID, h.Name)
+		if err != nil {
+			return nil, fmt.Errorf("can't update impl: %w", err)
+		}
+
+		for name, fn := range c.TypeDetails().Follows {
+			e.follows[name] = append(e.follows[name], followStruct{fn, &h})
+
+			// populate harmony_task_follow (table and column names as created by the 20230719 migration)
+			_, err := db.Exec(e.ctx, `INSERT INTO harmony_task_follow (owner_id, from_type, to_type)
+				VALUES ($1,$2,$3)`, e.ownerID, name, h.Name)
+			if err != nil {
+				return nil, fmt.Errorf("can't update harmony_task_follow: %w", err)
+			}
+		}
+	}
+
+	// resurrect old work
+	{
+		var taskRet []struct {
+			ID   int
+			Name string
+		}
+
+		err := db.Select(e.ctx, &taskRet, `SELECT id, name from harmony_task WHERE owner_id=$1`, e.ownerID)
+		if err != nil {
+			return nil, err
+		}
+		for _, w := range taskRet {
+			// edge-case: if old assignments are not available tasks, unlock them.
+			h := e.taskMap[w.Name]
+			if h == nil {
+				_, err := db.Exec(e.ctx, `UPDATE harmony_task SET owner_id=NULL WHERE id=$1`, w.ID)
+				if err != nil {
+					log.Error("Cannot remove self from owner field: ", err) // not really fatal, but not great
+				}
+				continue // no local handler for this task type: never fall through with a nil h
+			}
+			if !h.considerWork([]TaskID{TaskID(w.ID)}) {
+				log.Error("Strange: Unable to accept previously owned task: ", w.ID, w.Name)
+			}
+		}
+	}
+	for _, h := range e.handlers {
+		go h.Adder(h.AddTask)
+	}
+	go e.poller()
+
+	return e, nil
+}
+
+// GracefullyTerminate hangs until all present tasks have completed.
+// Call this to cleanly exit the process. As some processes are long-running,
+// passing a deadline will ignore those still running (to be picked-up later).
+func (e *TaskEngine) GracefullyTerminate(deadline time.Duration) {
+	e.grace()
+	e.reg.Shutdown()
+	deadlineChan := time.NewTimer(deadline).C
+
+	// block bumps & follows by unreg from DBs. Background: e.ctx was just cancelled by grace().
+	_, err := e.db.Exec(context.Background(), `DELETE FROM harmony_task_impl WHERE owner_id=$1`, e.ownerID)
+	if err != nil {
+		log.Warnf("Could not clean-up impl table: %s", err)
+	}
+	_, err = e.db.Exec(context.Background(), `DELETE FROM harmony_task_follow WHERE owner_id=$1`, e.ownerID)
+	if err != nil {
+		log.Warnf("Could not clean-up follow table: %s", err)
+	}
+top:
+	for _, h := range e.handlers {
+		if h.Count.Load() > 0 { // a task of this type is still running
+			select {
+			case <-deadlineChan:
+				return
+			default:
+				time.Sleep(time.Millisecond)
+				goto top // re-scan every handler from the start
+			}
+		}
+	}
+}
+
+func (e *TaskEngine) poller() {
+	for {
+		select {
+		case <-e.tryAllWork: ///////////////////// Find work after some work finished
+		case <-time.After(POLL_DURATION): //////// Find work periodically (one-shot timer; a Ticker per loop was never stopped)
+		case <-e.ctx.Done(): ///////////////////// Graceful exit
+			return
+		}
+		e.followWorkInDB()   // "Follows" the slow way
+		e.pollerTryAllWork() // "Bumps" (round robin tasks) the slow way
+	}
+}
+
+// followWorkInDB implements "Follows" the slow way: scan history since the last pass and add any missing follower tasks.
+func (e *TaskEngine) followWorkInDB() {
+	// Step 1: What are we following?
+	var lastFollowTime time.Time
+	lastFollowTime, e.lastFollowTime = e.lastFollowTime, time.Now()
+
+	for from_name, srcs := range e.follows {
+		var cList []int // Which work is done (that we follow) since we last checked?
+		err := e.db.Select(e.ctx, &cList, `SELECT h.task_id FROM harmony_task_history h
+			WHERE h.work_end>$1 AND h.name=$2`, lastFollowTime, from_name)
+		if err != nil {
+			log.Error("Could not query DB: ", err)
+			return
+		}
+		for _, src := range srcs {
+			for _, workAlreadyDone := range cList { // Were any tasks made to follow these tasks?
+				var ct int
+				err := e.db.QueryRow(e.ctx, `SELECT COUNT(*) FROM harmony_task
+					WHERE name=$1 AND previous_task=$2`, src.h.Name, workAlreadyDone).Scan(&ct)
+				if err != nil {
+					log.Error("Could not query harmony_task: ", err)
+					return // not recoverable here
+				}
+				if ct > 0 {
+					continue
+				}
+				// we need to create this task
+				if !src.h.Follows[from_name](TaskID(workAlreadyDone), src.h.AddTask) {
+					// But someone may have beaten us to it.
+					log.Infof("Unable to add task %s following Task(%d, %s)", src.h.Name, workAlreadyDone, from_name)
+				}
+			}
+		}
+	}
+}
+
+// pollerTryAllWork implements "Bumps" (next task) the slow way: for each handler, offer it the unowned tasks of its type.
+func (e *TaskEngine) pollerTryAllWork() {
+ if time.Since(e.lastCleanup.Load().(time.Time)) > CLEANUP_FREQUENCY {
+ e.lastCleanup.Store(time.Now())
+ resources.CleanupMachines(e.ctx, e.db)
+ }
+ for _, v := range e.handlers {
+ rerun:
+ if v.AssertMachineHasCapacity() != nil { // no room for another task of this type
+ continue
+ }
+ var unownedTasks []TaskID
+ err := e.db.Select(e.ctx, &unownedTasks, `SELECT id
+ FROM harmony_task
+ WHERE owner_id IS NULL AND name=$1
+ ORDER BY update_time`, v.Name)
+ if err != nil {
+ log.Error("Unable to read work ", err)
+ continue
+ }
+ accepted := v.considerWork(unownedTasks) // also true when the list is empty
+ if !accepted {
+ log.Warn("Work not accepted")
+ continue
+ }
+ if len(unownedTasks) > 1 { // one was just taken here and at least one more is still waiting
+ e.bump(v.Name) // wait for others before trying again to add work.
+ goto rerun
+ }
+ }
+}
+
+// AddHttpHandlers registers GET /scheduler/follows/<name>/:tID and /scheduler/bump/<name>/:tID on root.
+// This implements the receiver-side of "follows" and "bumps" the fast way. TODO: call it from the http server.
+func (e *TaskEngine) AddHttpHandlers(root gin.IRouter) {
+	f := root.Group("/scheduler/follows")
+	for name, v := range e.follows {
+		v := v // the closure outlives this iteration; do not share the loop variable
+		f.GET("/"+name+"/:tID", func(c *gin.Context) {
+			tID, err := strconv.Atoi(c.Param("tID"))
+			if err != nil {
+				c.AbortWithError(400, err) // a malformed task ID is a bad request, not an auth failure
+				return
+			}
+			taskAdded := false
+			for _, fs := range v {
+				taskAdded = fs.f(TaskID(tID), fs.h.AddTask) || taskAdded // call first so every follower runs
+			}
+			if !taskAdded {
+				c.Status(202) // NOTE: 202 for "accepted" but not worked.
+				return
+			}
+			e.tryAllWork <- true
+			c.Status(200)
+		})
+	}
+	b := root.Group("/scheduler/bump")
+	for _, h := range e.handlers {
+		h := h // per-iteration copy for the closure, as above
+		b.GET("/"+h.Name+"/:tID", func(c *gin.Context) {
+			tID, err := strconv.Atoi(c.Param("tID"))
+			if err != nil {
+				c.AbortWithError(400, err)
+				return
+			}
+			// We NEED to block while trying to deliver this work to ease the network impact.
+			if !h.considerWork([]TaskID{TaskID(tID)}) {
+				c.Status(202) // NOTE: 202 for "accepted" but not worked.
+				return
+			}
+			c.Status(200) // the only status bump() treats as "taken"
+		})
+	}
+}
+
+// bump asks each machine registered for taskType in harmony_task_impl to take work, stopping at the first 200 reply.
+func (e *TaskEngine) bump(taskType string) {
+	var res []string
+	err := e.db.Select(e.ctx, &res, `SELECT host_and_port FROM harmony_machines m JOIN harmony_task_impl i ON i.owner_id=m.id WHERE i.name=$1`, taskType)
+	if err != nil {
+		log.Error("Could not read db for bump: ", err)
+		return
+	}
+	for _, hostAndPort := range res {
+		// NOTE(review): the receiving route is /scheduler/bump/<name>/:tID, but no task ID is available here — confirm intent.
+		resp, err := hClient.Get("http://" + hostAndPort + "/scheduler/bump/" + taskType) // host_and_port carries no scheme
+		if err != nil {
+			log.Info("Server unreachable to bump: ", err)
+			continue
+		}
+		_ = resp.Body.Close() // only the status code matters; release the connection
+		if resp.StatusCode == 200 {
+			return // just want 1 taker.
+		}
+	}
+}
+
+// resourcesInUse returns, despite its name, what is still free: the registered resources minus the cost of every running task (Cpu, Gpu, Ram only).
+func (e *TaskEngine) resourcesInUse() resources.Resources {
+ tmp := e.reg.Resources
+ for _, t := range e.handlers {
+ ct := t.Count.Load() // tasks of this type currently running
+ tmp.Cpu -= int(ct) * t.Cost.Cpu
+ tmp.Gpu -= float64(ct) * t.Cost.Gpu
+ tmp.Ram -= uint64(ct) * t.Cost.Ram // unsigned: wraps around if running tasks cost more than the registered Ram
+ }
+ return tmp
+}
diff --git a/lib/harmony/harmonytask/notifyingMx.go b/lib/harmony/harmonytask/notifyingMx.go
new file mode 100644
index 000000000..51c4e0a53
--- /dev/null
+++ b/lib/harmony/harmonytask/notifyingMx.go
@@ -0,0 +1,16 @@
+package harmonytask
+
+import "sync"
+
+type notifyingMx struct {
+ sync.Mutex
+ UnlockNotify func() // optional hook; Unlock calls it after the mutex has been released
+}
+
+func (n *notifyingMx) Unlock() {
+ tmp := n.UnlockNotify // read the hook while the lock is still held
+ n.Mutex.Unlock()
+ if tmp != nil {
+ tmp() // runs outside the critical section
+ }
+}
diff --git a/lib/harmony/harmonytask/taskTypeHandler.go b/lib/harmony/harmonytask/taskTypeHandler.go
new file mode 100644
index 000000000..079f33704
--- /dev/null
+++ b/lib/harmony/harmonytask/taskTypeHandler.go
@@ -0,0 +1,276 @@
+package harmonytask
+
+import (
+ "context"
+ "errors"
+ "io"
+ "net/http"
+ "strconv"
+ "sync/atomic"
+ "time"
+
+ "github.com/filecoin-project/lotus/lib/harmony/harmonydb"
+ logging "github.com/ipfs/go-log/v2"
+)
+
+var log = logging.Logger("harmonytask")
+
+type taskTypeHandler struct {
+ TaskInterface
+ TaskTypeDetails
+ TaskEngine *TaskEngine
+ Count atomic.Int32 // tasks of this type currently running here; read by resourcesInUse and GracefullyTerminate
+
+}
+
+// AddTask inserts a harmony_task row of this handler's type and passes the new
+// ID to extra inside the same transaction; extra returning false is expected to
+// abort it. Once added, the task is offered to this machine, then to peers via bump.
+func (h *taskTypeHandler) AddTask(extra func(TaskID, *harmonydb.Tx) bool) {
+	var tID TaskID
+	did, err := h.TaskEngine.db.BeginTransaction(h.TaskEngine.ctx, func(tx *harmonydb.Tx) bool {
+		// create taskID (from DB). RETURNING hands back the row we just inserted;
+		// re-reading the "newest" row could pick up another writer's task.
+		err := tx.QueryRow(`INSERT INTO harmony_task (name, added_by, posted_time)
+			VALUES ($1, $2, CURRENT_TIMESTAMP) RETURNING id`, h.Name, h.TaskEngine.ownerID).Scan(&tID)
+		if err != nil {
+			log.Error("Could not insert into harmonyTask: ", err)
+			return false // never call extra with an unassigned task ID
+		}
+		return extra(tID, tx)
+	})
+	if err != nil {
+		log.Error(err)
+	}
+	if !did {
+		return
+	}
+
+	if !h.considerWork([]TaskID{tID}) {
+		h.TaskEngine.bump(h.Name) // We can't do it. How about someone else.
+	}
+}
+
+// considerWork tries to claim and start one of ids on this machine; true means a task was started (or ids was empty).
+func (h *taskTypeHandler) considerWork(ids []TaskID) (workAccepted bool) {
+	if len(ids) == 0 {
+		return true // stop looking for takers
+	}
+
+	// 1. Can we do any more of this task type?
+	if h.Max > -1 && int(h.Count.Load()) >= h.Max {
+		log.Infow("did not accept task", "name", h.Name, "reason", "at max already")
+		return false
+	}
+
+	h.TaskEngine.workAdderMutex.Lock()
+	defer h.TaskEngine.workAdderMutex.Unlock()
+
+	// 2. Can we do any more work?
+	err := h.AssertMachineHasCapacity()
+	if err != nil {
+		log.Info(err)
+		return false
+	}
+
+	// 3. What does the impl say?
+	tID, err := h.CanAccept(ids)
+	if err != nil {
+		log.Error(err)
+		return false
+	}
+	if tID == nil {
+		log.Infow("did not accept task", "task_id", ids[0], "reason", "CanAccept() refused")
+		return false
+	}
+
+	// 4. Can we claim the work for our hostname?
+	ct, err := h.TaskEngine.db.Exec(h.TaskEngine.ctx, "UPDATE harmony_task SET owner_id=$1 WHERE id=$2 AND owner_id IS NULL", h.TaskEngine.ownerID, *tID)
+	if err != nil {
+		log.Error(err)
+		return false
+	}
+	if ct == 0 {
+		log.Infow("did not accept task", "task_id", strconv.Itoa(int(*tID)), "reason", "already Taken")
+		return false
+	}
+
+	h.Count.Add(1) // reserve capacity while workAdderMutex is still held, so the next capacity check sees it
+	go func() {
+		var done bool
+		var doErr error
+		workStart := time.Now()
+
+		defer func() {
+			if r := recover(); r != nil {
+				log.Error("Recovered from a serious error "+
+					"while processing "+h.Name+" task "+strconv.Itoa(int(*tID))+": ", r)
+			}
+			h.recordCompletion(*tID, workStart, done, doErr)
+			h.Count.Add(-1) // release only after the outcome is recorded; GracefullyTerminate waits on Count
+
+			if done {
+				h.triggerCompletionListeners(*tID)
+			}
+
+			h.TaskEngine.tryAllWork <- true // Activate tasks in this machine
+		}()
+
+		done, doErr = h.Do(*tID, func() bool {
+			var owner int
+			// Background here because we don't want GracefulRestart to block this save.
+			err := h.TaskEngine.db.QueryRow(context.Background(),
+				`SELECT owner_id FROM harmony_task WHERE id=$1`, *tID).Scan(&owner)
+			if err != nil {
+				log.Error("Cannot determine ownership: ", err)
+				return false
+			}
+			return owner == h.TaskEngine.ownerID
+		})
+		if doErr != nil {
+			log.Error("Do("+h.Name+", taskID="+strconv.Itoa(int(*tID))+") returned error: ", doErr)
+		}
+	}()
+	return true
+}
+
+// recordCompletion writes a harmony_task_history row for the run and, in the same transaction, deletes or disowns the task.
+func (h *taskTypeHandler) recordCompletion(tID TaskID, workStart time.Time, done bool, doErr error) {
+	workEnd := time.Now()
+	// Background: the engine ctx is already cancelled while GracefullyTerminate waits for running tasks.
+	cm, err := h.TaskEngine.db.BeginTransaction(context.Background(), func(tx *harmonydb.Tx) bool {
+		var postedTime time.Time
+		err := tx.QueryRow(`SELECT posted_time FROM harmony_task WHERE id=$1`, tID).Scan(&postedTime)
+		if err != nil {
+			log.Error("Could not log completion: ", err)
+			return false
+		}
+		result := "unspecified error"
+		if done {
+			_, err = tx.Exec("DELETE FROM harmony_task WHERE id=$1", tID)
+			if err != nil {
+				log.Error("Could not log completion: ", err)
+				return false
+			}
+			result = ""
+		} else {
+			if doErr != nil {
+				result = "error: " + doErr.Error()
+			}
+			var deleteTask bool
+			if h.MaxFailures > 0 {
+				ct := uint(0)
+				err = tx.QueryRow(`SELECT count(*) FROM harmony_task_history
+					WHERE task_id=$1 AND result=FALSE`, tID).Scan(&ct)
+				if err != nil {
+					log.Error("Could not read task history:", err)
+					return false
+				}
+				if ct >= h.MaxFailures { // ct counts earlier failures only; this run is not in history yet
+					deleteTask = true
+				}
+			}
+			if deleteTask {
+				_, err = tx.Exec("DELETE FROM harmony_task WHERE id=$1", tID)
+				if err != nil {
+					log.Error("Could not delete failed job: ", err)
+					return false
+				}
+				// Note: Extra Info is left laying around for later review & clean-up
+			} else {
+				_, err = tx.Exec(`UPDATE harmony_task SET owner_id=NULL WHERE id=$1`, tID) // capture err: it is checked next
+				if err != nil {
+					log.Error("Could not disown failed task: ", tID, err)
+					return false
+				}
+			}
+		}
+		_, err = tx.Exec(`INSERT INTO harmony_task_history
+			(task_id, name, posted, work_start, work_end, result, err)
+			VALUES ($1, $2, $3, $4, $5, $6, $7)`, tID, h.Name, postedTime, workStart, workEnd, done, result)
+		if err != nil {
+			log.Error("Could not write history: ", err)
+			return false
+		}
+		return true
+	})
+	if err != nil {
+		log.Error("Could not record transaction: ", err)
+		return
+	}
+	if !cm {
+		log.Error("Committing the task records failed")
+	}
+}
+
+// AssertMachineHasCapacity returns nil when one more task of this type fits in
+// what TaskEngine.resourcesInUse reports, or an error naming the short resource.
+func (h *taskTypeHandler) AssertMachineHasCapacity() error {
+	avail := h.TaskEngine.resourcesInUse()
+	switch {
+	case avail.Cpu-h.Cost.Cpu < 0:
+		return errors.New("Did not accept " + h.Name + " task: out of cpu")
+	case h.Cost.Ram > avail.Ram:
+		return errors.New("Did not accept " + h.Name + " task: out of RAM")
+	case avail.Gpu-h.Cost.Gpu < 0:
+		return errors.New("Did not accept " + h.Name + " task: out of available GPU")
+	}
+	return nil
+}
+
+// hClient is the shared HTTP client used to nudge peer machines (bump and
+// follows); the 3-second timeout bounds how long an unreachable peer can block.
+var hClient = http.Client{
+	Timeout: 3 * time.Second,
+}
+
+// triggerCompletionListeners does in order:
+// 1. Trigger all in-process followers (b/c it's fast).
+// 2. Trigger all living processes with followers via DB
+// 3. Future followers (think partial upgrade) can read harmony_task_history
+// 3a. The poller's followWorkInDB() handles slow follows.
+func (h *taskTypeHandler) triggerCompletionListeners(tID TaskID) {
+	// InProcess (#1 from Description)
+	inProcessDefs := h.TaskEngine.follows[h.Name]
+	inProcessFollowers := make([]string, 0, len(inProcessDefs)) // length 0: append must not leave empty names in front
+	for _, fs := range inProcessDefs {
+		if fs.f(tID, fs.h.AddTask) {
+			inProcessFollowers = append(inProcessFollowers, fs.h.Name)
+		}
+	}
+
+	// Over HTTP (#2 from Description)
+	var hps []struct {
+		HostAndPort string
+		ToType      string
+	}
+	// "!= ALL($2)" is the array form of NOT IN; it is also true for an empty list.
+	err := h.TaskEngine.db.Select(h.TaskEngine.ctx, &hps, `SELECT m.host_and_port, to_type
+		FROM harmony_task_follow f JOIN harmony_machines m ON m.id=f.owner_id
+		WHERE from_type=$1 AND to_type != ALL($2) AND f.owner_id != $3`, h.Name, inProcessFollowers, h.TaskEngine.ownerID)
+	if err != nil {
+		log.Warn("Could not fast-trigger partner processes.", err)
+		return
+	}
+	hostsVisited := map[string]bool{}
+	tasksVisited := map[string]bool{}
+	for _, v := range hps {
+		if hostsVisited[v.HostAndPort] || tasksVisited[v.ToType] {
+			continue
+		}
+		// The receiving route is /scheduler/follows/<from>/:tID; host_and_port carries no scheme.
+		resp, err := hClient.Get("http://" + v.HostAndPort + "/scheduler/follows/" + h.Name + "/" + strconv.Itoa(int(tID)))
+		if err != nil {
+			log.Warn("Couldn't hit http endpoint: ", err)
+			continue
+		}
+		b, err := io.ReadAll(resp.Body)
+		_ = resp.Body.Close() // always release the connection, even when the read failed
+		if err != nil {
+			log.Warn("Couldn't hit http endpoint: ", err)
+			continue
+		}
+		hostsVisited[v.HostAndPort], tasksVisited[v.ToType] = true, true
+		if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
+			log.Error("IO failed for fast nudge: ", string(b))
+		}
+	}
+}
diff --git a/lib/harmony/resources/memsys.go b/lib/harmony/resources/memsys.go
new file mode 100644
index 000000000..1a45b5b22
--- /dev/null
+++ b/lib/harmony/resources/memsys.go
@@ -0,0 +1,22 @@
+//go:build darwin || freebsd || openbsd || dragonfly || netbsd
+// +build darwin freebsd openbsd dragonfly netbsd
+
+package resources
+
+import (
+ "encoding/binary"
+ "syscall"
+)
+
+func sysctlUint64(name string) (uint64, error) {
+	s, err := syscall.Sysctl(name)
+	if err != nil {
+		return 0, err
+	}
+	// hack because the string conversion above drops a \0
+	b := []byte(s)
+	for len(b) < 8 { // zero-pad all missing high bytes (little-endian): Uint64 panics on fewer than 8
+		b = append(b, 0)
+	}
+	return binary.LittleEndian.Uint64(b), nil
+}
diff --git a/lib/harmony/resources/resources.go b/lib/harmony/resources/resources.go
new file mode 100644
index 000000000..77200b873
--- /dev/null
+++ b/lib/harmony/resources/resources.go
@@ -0,0 +1,180 @@
+package resources
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "os/exec"
+ "regexp"
+ "runtime"
+ "strings"
+ "sync/atomic"
+ "time"
+
+ cl "github.com/Nv7-Github/go-cl"
+ ffi "github.com/filecoin-project/filecoin-ffi"
+ "github.com/filecoin-project/lotus/lib/harmony/harmonydb"
+ logging "github.com/ipfs/go-log/v2"
+ "github.com/pbnjay/memory"
+
+ "golang.org/x/sys/unix"
+
+ "github.com/samber/lo"
+)
+
+var LOOKS_DEAD_TIMEOUT = 10 * time.Minute // Time w/o minute heartbeats
+
+type Resources struct {
+ Cpu int
+ Gpu float64
+ GpuRam uint64
+ Ram uint64
+ MachineID int
+}
+type Reg struct {
+ Resources
+ shutdown atomic.Bool
+}
+
+var logger = logging.Logger("harmonytask")
+
+var lotusRE = regexp.MustCompile("lotus-worker|lotus-harmony|yugabyted")
+
+// Register stores this machine's resources in harmony_machines (keyed by hostnameAndPort) and starts a once-a-minute heartbeat.
+func Register(db *harmonydb.DB, hostnameAndPort string) (*Reg, error) {
+	var reg Reg
+	var err error
+	reg.Resources, err = getResources()
+	if err != nil {
+		return nil, err
+	}
+	ctx := context.Background()
+	{ // Learn our owner_id while updating harmony_machines
+		var ownerID []int
+		err := db.Select(ctx, &ownerID, `SELECT id FROM harmony_machines WHERE host_and_port=$1`, hostnameAndPort)
+		if err != nil {
+			return nil, fmt.Errorf("could not read from harmony_machines: %w", err)
+		}
+		if len(ownerID) == 0 {
+			err = db.QueryRow(ctx, `INSERT INTO harmony_machines
+				(host_and_port, cpu, ram, gpu, gpuram) VALUES
+				($1,$2,$3,$4,$5) RETURNING id`,
+				hostnameAndPort, reg.Cpu, reg.Ram, reg.Gpu, reg.GpuRam).Scan(&reg.Resources.MachineID)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			reg.MachineID = ownerID[0]
+			_, err := db.Exec(ctx, `UPDATE harmony_machines SET
+				cpu=$1, ram=$2, gpu=$3, gpuram=$4, last_contact=CURRENT_TIMESTAMP WHERE id=$5`,
+				reg.Cpu, reg.Ram, reg.Gpu, reg.GpuRam, reg.Resources.MachineID) // refresh last_contact so the cleanup below keeps our row
+			if err != nil {
+				return nil, err
+			}
+		}
+		CleanupMachines(ctx, db)
+	}
+	go func() {
+		for {
+			time.Sleep(time.Minute)
+			if reg.shutdown.Load() {
+				return
+			}
+			_, err := db.Exec(ctx, `UPDATE harmony_machines SET last_contact=CURRENT_TIMESTAMP WHERE id=$1`, reg.MachineID) // only our own row
+			if err != nil {
+				logger.Error("Cannot keepalive ", err)
+			}
+		}
+	}()
+
+	return &reg, nil
+}
+func CleanupMachines(ctx context.Context, db *harmonydb.DB) int {
+	cutoff := time.Now().Add(-1 * LOOKS_DEAD_TIMEOUT) // machines silent since before this are dropped
+	removed, err := db.Exec(ctx, `DELETE FROM harmony_machines WHERE last_contact < $1`, cutoff)
+	if err != nil {
+		logger.Warn("unable to delete old machines: ", err)
+	}
+	return removed // number of rows the DELETE reported
+}
+
+func (res *Reg) Shutdown() {
+ res.shutdown.Store(true) // Register's heartbeat loop checks this after each one-minute sleep and then exits
+}
+
+func getResources() (res Resources, err error) {
+ b, err := exec.Command(`ps`, `-ef`).CombinedOutput() // best-effort check only; a failure is logged, not returned
+ if err != nil {
+ logger.Warn("Could not safety check for 2+ processes: ", err)
+ } else {
+ found := 0
+ for _, b := range bytes.Split(b, []byte("\n")) {
+ if lotusRE.Match(b) {
+ found++
+ }
+ }
+ if found > 1 { // more than one line of the ps output matched lotusRE
+ logger.Error("This Lotus process should run alone on a machine. Use CGroup.")
+ }
+ }
+
+ res = Resources{
+ Cpu: runtime.NumCPU(),
+ Ram: memory.FreeMemory(),
+ GpuRam: getGpuRam(),
+ }
+
+ { // GPU boolean: Gpu becomes 1 if several devices are listed or any name contains "ati" or "nvidia"
+ gpus, err := ffi.GetGPUDevices()
+ if err != nil {
+ logger.Errorf("getting gpu devices failed: %+v", err)
+ }
+ all := strings.ToLower(strings.Join(gpus, ","))
+ if len(gpus) > 1 || strings.Contains(all, "ati") || strings.Contains(all, "nvidia") {
+ res.Gpu = 1
+ }
+ }
+
+ return res, nil // the error result is always nil; failures above are only logged
+}
+
+func getGpuRam() uint64 {
+ platforms, err := cl.GetPlatforms()
+ if err != nil {
+ logger.Error(err)
+ return 0 // reported as no GPU RAM when the platforms cannot be listed
+ }
+
+ return uint64(lo.SumBy(platforms, func(p *cl.Platform) int64 { // sum over every platform
+ d, err := p.GetDevices(cl.DeviceTypeAll)
+ if err != nil {
+ logger.Error(err)
+ return 0 // a platform whose devices cannot be listed contributes nothing
+ }
+ return lo.SumBy(d, func(d *cl.Device) int64 { return d.GlobalMemSize() }) // sum over that platform's devices
+ }))
+}
+
+func DiskFree(path string) (uint64, error) {
+ s := unix.Statfs_t{}
+ err := unix.Statfs(path, &s) // statistics for the filesystem that contains path
+ if err != nil {
+ return 0, err
+ }
+
+ return s.Bfree * uint64(s.Bsize), nil // free blocks x block size. NOTE(review): Bfree may include root-reserved blocks; confirm Bavail is not what is wanted
+}
+
+/* NOT for Darwin.
+func GetMemFree() uint64 {
+ in := unix.Sysinfo_t{}
+ err := unix.Sysinfo(&in)
+ if err != nil {
+ return 0
+ }
+ // If this is a 32-bit system, then these fields are
+ // uint32 instead of uint64.
+ // So we always convert to uint64 to match signature.
+ return uint64(in.Freeram) * uint64(in.Unit)
+}
+*/