Update back off mechanism to only back off if the endpoint is not available

This commit is contained in:
Shreerang Kale 2025-08-01 15:53:35 +05:30
parent 307abfc78f
commit 0dc3f060ef
6 changed files with 55 additions and 26 deletions

View File

@ -6,7 +6,7 @@ QUICKNODE_TOKEN=your_token_here
# Proxy settings
PROXY_PORT=8545
CACHE_SIZE_GB=100
CACHE_SIZE_GB=1
BACKOFF_MINUTES=30
# Logging

View File

@ -6,7 +6,7 @@ import diskcache
class Cache:
def __init__(self, cache_dir: str = "./cache", size_limit_gb: int = 100):
def __init__(self, cache_dir: str = "./cache", size_limit_gb: int = 1):
self.cache_dir = cache_dir
self.size_limit_bytes = size_limit_gb * 1024 * 1024 * 1024
self.cache = diskcache.Cache(
@ -57,4 +57,3 @@ class Cache:
"count": stats[0],
"limit_gb": self.size_limit_bytes / (1024 * 1024 * 1024)
}

View File

@ -44,14 +44,15 @@ class CachePolicy:
Returns:
True if the method should be cached, False otherwise
"""
if method in self.CACHEABLE_IMMUTABLE:
if method in self.CACHEABLE_WITH_TTL:
# For getBlock, only cache finalized blocks
if method == 'getBlock':
commitment = self._get_commitment(params)
return commitment == 'finalized'
return True
if method in self.CACHEABLE_WITH_TTL:
if method in self.CACHEABLE_IMMUTABLE:
return True
# Default to not caching unknown methods

View File

@ -34,7 +34,7 @@ Provider class:
Cache class:
- get(method: str, params: dict) -> Optional[response]
- set(method: str, params: dict, response: dict) -> None
- size_check() -> None # Enforce 100GB limit
- size_check() -> None # Enforce 1GB limit
- clear_oldest() -> None # LRU eviction
```
@ -42,7 +42,7 @@ Cache class:
- Use `diskcache` library for simplicity
- Key format: `f"{method}:{json.dumps(params, sort_keys=True)}"`
- Store both HTTP responses and WebSocket messages
- Implement 100GB limit with LRU eviction
- Implement 1GB limit with LRU eviction
### 3. Error Logger Module (`errors.py`)
**Purpose**: SQLite-based error logging with UUID tracking
@ -146,7 +146,7 @@ QUICKNODE_TOKEN=your_token_here
# Proxy settings
PROXY_PORT=8545
CACHE_SIZE_GB=100
CACHE_SIZE_GB=1
BACKOFF_MINUTES=30
# Logging
@ -227,7 +227,7 @@ Happy-path end-to-end tests only:
## Deployment Considerations
1. **Cache Storage**: Need ~100GB disk space
1. **Cache Storage**: Need ~1GB disk space
2. **Memory Usage**: Keep minimal, use disk cache
3. **Concurrent Clients**: Basic round-robin if multiple connect
4. **Monitoring**: Log all errors, provide error IDs
@ -273,7 +273,7 @@ aiohttp-cors==0.7.0
1. Single endpoint proxies to 5 providers
2. Automatic failover works
3. Responses are cached (up to 100GB)
3. Responses are cached (up to 1GB)
4. Errors logged with retrievable IDs
5. Both HTTP and WebSocket work
6. Response format is unified

View File

@ -35,7 +35,7 @@ def load_config() -> dict:
return {
"proxy_port": int(os.getenv("PROXY_PORT", 8545)),
"cache_size_gb": int(os.getenv("CACHE_SIZE_GB", 100)),
"cache_size_gb": int(os.getenv("CACHE_SIZE_GB", 1)),
"backoff_minutes": int(os.getenv("BACKOFF_MINUTES", 30)),
"log_level": os.getenv("LOG_LEVEL", "INFO"),
"error_db_path": os.getenv("ERROR_DB_PATH", "./errors.db"),
@ -72,7 +72,7 @@ def main() -> None:
logger = logging.getLogger(__name__)
logger.info(f"Starting Solana RPC Proxy on port {config['proxy_port']}")
logger.info(f"Intelligent caching enabled - Cache size limit: {config['cache_size_gb']}GB")
logger.info(f"Cache size limit: {config['cache_size_gb']}GB")
logger.info(f"Provider backoff time: {config['backoff_minutes']} minutes")
app = create_app(config)

View File

@ -20,7 +20,7 @@ class Router:
async def route_request(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
request = {"method": method, "params": params}
# Check if this method should be cached based on intelligent caching policy
# Check if this method should be cached based on caching policy
should_cache = self.cache_policy.should_cache(method, params)
if should_cache:
@ -60,7 +60,13 @@ class Router:
except Exception as error:
error_id = self.error_logger.log_error(provider.name, request, error)
self.logger.warning(f"Provider {provider.name} failed: {error} (ID: {error_id})")
# Only mark provider as failed for server/network issues, not RPC errors
if await self._is_server_failure(provider, error):
provider.mark_failed()
self.logger.warning(f"Provider {provider.name} marked as failed due to server issue")
else:
self.logger.debug(f"Provider {provider.name} had RPC error but server is available")
return self._create_error_response(
"All providers failed to handle the request",
@ -77,6 +83,29 @@ class Router:
return None
async def _is_server_failure(self, provider: Provider, error: Exception) -> bool:
    """
    Decide whether a provider failure reflects a real server outage.

    Fires a lightweight HTTP probe at the provider's host root. Any HTTP
    response at all — even an error status code — proves the server is
    reachable, so the triggering failure was an RPC-level problem and the
    provider should NOT be backed off.

    Args:
        provider: Provider whose availability is being probed.
        error: The exception that triggered this check (not inspected;
            kept for interface compatibility).

    Returns:
        True if the provider host is unreachable, False otherwise.
    """
    try:
        # Keep the probe quick so a dead host doesn't stall routing.
        probe_timeout = aiohttp.ClientTimeout(total=5)  # 5 second timeout
        async with aiohttp.ClientSession(timeout=probe_timeout) as session:
            # Probe only scheme://host — a plain GET against the root.
            from urllib.parse import urlparse
            parts = urlparse(provider.http_url)
            probe_url = f"{parts.scheme}://{parts.netloc}"
            async with session.get(probe_url) as probe_response:
                # Any response (including 4xx/5xx) means the server is alive.
                return False
    except Exception as probe_error:
        # No response at all within the timeout: treat the host as down.
        self.logger.debug(f"Health check failed for {provider.name}: {probe_error}")
        return True
async def _make_request(self, provider: Provider, request: Dict[str, Any]) -> Dict[str, Any]:
transformed_request = provider.transform_request(request)