Update backoff mechanism to only back off if the endpoint is not available

Shreerang Kale 2025-08-01 15:53:35 +05:30
parent 307abfc78f
commit 0dc3f060ef
6 changed files with 55 additions and 26 deletions

View File

@@ -6,7 +6,7 @@ QUICKNODE_TOKEN=your_token_here
 
 # Proxy settings
 PROXY_PORT=8545
-CACHE_SIZE_GB=100
+CACHE_SIZE_GB=1
 BACKOFF_MINUTES=30
 
 # Logging

View File

@@ -6,7 +6,7 @@ import diskcache
 
 class Cache:
-    def __init__(self, cache_dir: str = "./cache", size_limit_gb: int = 100):
+    def __init__(self, cache_dir: str = "./cache", size_limit_gb: int = 1):
         self.cache_dir = cache_dir
         self.size_limit_bytes = size_limit_gb * 1024 * 1024 * 1024
         self.cache = diskcache.Cache(
@@ -57,4 +57,3 @@ class Cache:
             "count": stats[0],
             "limit_gb": self.size_limit_bytes / (1024 * 1024 * 1024)
         }
-
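The size cap is delegated to `diskcache`, which accepts a byte `size_limit` and culls entries once the on-disk volume exceeds it. A minimal sketch of the constructor call this hunk implies, assuming the LRU policy named in the design doc (the `eviction_policy` argument and the sample key are illustrative, not code from this commit):

```python
import diskcache

size_limit_gb = 1
cache = diskcache.Cache(
    "./cache",
    size_limit=size_limit_gb * 1024 ** 3,    # limit in bytes; diskcache evicts beyond this
    eviction_policy="least-recently-used",   # assumption: LRU, per the design doc
)

cache.set('getGenesisHash:{}', {"result": "..."})  # hypothetical key/value
print(cache.volume())  # current on-disk size in bytes
```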

View File

@@ -44,14 +44,15 @@ class CachePolicy:
         Returns:
             True if the method should be cached, False otherwise
         """
-        if method in self.CACHEABLE_IMMUTABLE:
+        if method in self.CACHEABLE_WITH_TTL:
             # For getBlock, only cache finalized blocks
             if method == 'getBlock':
                 commitment = self._get_commitment(params)
                 return commitment == 'finalized'
             return True
-        if method in self.CACHEABLE_WITH_TTL:
+        if method in self.CACHEABLE_IMMUTABLE:
             return True
 
         # Default to not caching unknown methods
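The effect of this hunk is to move the finalized-only guard for `getBlock` from the `CACHEABLE_IMMUTABLE` branch to the `CACHEABLE_WITH_TTL` branch. Reassembled, the new control flow reads roughly as below; the set contents and the `_get_commitment` body are assumptions for illustration, only the names come from the diff:

```python
class CachePolicy:
    CACHEABLE_IMMUTABLE = {"getGenesisHash"}             # assumed contents
    CACHEABLE_WITH_TTL = {"getBlock", "getBlockTime"}    # assumed contents

    def should_cache(self, method: str, params: dict) -> bool:
        if method in self.CACHEABLE_WITH_TTL:
            # For getBlock, only cache finalized blocks
            if method == "getBlock":
                return self._get_commitment(params) == "finalized"
            return True
        if method in self.CACHEABLE_IMMUTABLE:
            return True
        # Default to not caching unknown methods
        return False

    def _get_commitment(self, params: dict) -> str:
        # assumption: commitment travels in params, defaulting to "processed"
        return (params or {}).get("commitment", "processed")
```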

View File

@@ -34,7 +34,7 @@ Provider class:
 
 Cache class:
 - get(method: str, params: dict) -> Optional[response]
 - set(method: str, params: dict, response: dict) -> None
-- size_check() -> None  # Enforce 100GB limit
+- size_check() -> None  # Enforce 1GB limit
 - clear_oldest() -> None  # LRU eviction
 ```
@@ -42,7 +42,7 @@ Cache class:
 - Use `diskcache` library for simplicity
 - Key format: `f"{method}:{json.dumps(params, sort_keys=True)}"`
 - Store both HTTP responses and WebSocket messages
-- Implement 100GB limit with LRU eviction
+- Implement 1GB limit with LRU eviction
 
 ### 3. Error Logger Module (`errors.py`)
 **Purpose**: SQLite-based error logging with UUID tracking
@@ -146,7 +146,7 @@ QUICKNODE_TOKEN=your_token_here
 
 # Proxy settings
 PROXY_PORT=8545
-CACHE_SIZE_GB=100
+CACHE_SIZE_GB=1
 BACKOFF_MINUTES=30
 
 # Logging
@@ -227,7 +227,7 @@ Happy-path end-to-end tests only:
 
 ## Deployment Considerations
-1. **Cache Storage**: Need ~100GB disk space
+1. **Cache Storage**: Need ~1GB disk space
 2. **Memory Usage**: Keep minimal, use disk cache
 3. **Concurrent Clients**: Basic round-robin if multiple connect
 4. **Monitoring**: Log all errors, provide error IDs
@@ -273,7 +273,7 @@ aiohttp-cors==0.7.0
 
 1. Single endpoint proxies to 5 providers
 2. Automatic failover works
-3. Responses are cached (up to 100GB)
+3. Responses are cached (up to 1GB)
 4. Errors logged with retrievable IDs
 5. Both HTTP and WebSocket work
 6. Response format is unified
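One detail from the design doc worth illustrating is the cache key format: `json.dumps(..., sort_keys=True)` makes keys deterministic, so two requests whose params differ only in ordering hit the same entry (the param values here are hypothetical):

```python
import json

def cache_key(method: str, params: dict) -> str:
    # sort_keys normalizes param order before the key is built
    return f"{method}:{json.dumps(params, sort_keys=True)}"

a = cache_key("getBlock", {"slot": 1, "commitment": "finalized"})
b = cache_key("getBlock", {"commitment": "finalized", "slot": 1})
assert a == b  # both: getBlock:{"commitment": "finalized", "slot": 1}
```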

View File

@@ -35,7 +35,7 @@ def load_config() -> dict:
     return {
         "proxy_port": int(os.getenv("PROXY_PORT", 8545)),
-        "cache_size_gb": int(os.getenv("CACHE_SIZE_GB", 100)),
+        "cache_size_gb": int(os.getenv("CACHE_SIZE_GB", 1)),
         "backoff_minutes": int(os.getenv("BACKOFF_MINUTES", 30)),
         "log_level": os.getenv("LOG_LEVEL", "INFO"),
         "error_db_path": os.getenv("ERROR_DB_PATH", "./errors.db"),
@@ -72,7 +72,7 @@ def main() -> None:
     logger = logging.getLogger(__name__)
     logger.info(f"Starting Solana RPC Proxy on port {config['proxy_port']}")
-    logger.info(f"Intelligent caching enabled - Cache size limit: {config['cache_size_gb']}GB")
+    logger.info(f"Cache size limit: {config['cache_size_gb']}GB")
     logger.info(f"Provider backoff time: {config['backoff_minutes']} minutes")
 
     app = create_app(config)
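A side note on the `int(os.getenv(...))` pattern in `load_config`: `os.getenv` returns a string when the variable is set and the raw default otherwise, and `int()` accepts both, so one expression covers both paths:

```python
import os

os.environ["CACHE_SIZE_GB"] = "2"
assert int(os.getenv("CACHE_SIZE_GB", 1)) == 2  # set: "2" -> 2
del os.environ["CACHE_SIZE_GB"]
assert int(os.getenv("CACHE_SIZE_GB", 1)) == 1  # unset: default 1 passes through
```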

View File

@@ -20,7 +20,7 @@ class Router:
 
     async def route_request(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
         request = {"method": method, "params": params}
-        # Check if this method should be cached based on intelligent caching policy
+        # Check if this method should be cached based on caching policy
         should_cache = self.cache_policy.should_cache(method, params)
 
         if should_cache:
@@ -60,7 +60,13 @@ class Router:
             except Exception as error:
                 error_id = self.error_logger.log_error(provider.name, request, error)
                 self.logger.warning(f"Provider {provider.name} failed: {error} (ID: {error_id})")
-                provider.mark_failed()
+                # Only mark provider as failed for server/network issues, not RPC errors
+                if await self._is_server_failure(provider, error):
+                    provider.mark_failed()
+                    self.logger.warning(f"Provider {provider.name} marked as failed due to server issue")
+                else:
+                    self.logger.debug(f"Provider {provider.name} had RPC error but server is available")
 
         return self._create_error_response(
             "All providers failed to handle the request",
@@ -77,6 +83,29 @@ class Router:
         return None
 
+    async def _is_server_failure(self, provider: Provider, error: Exception) -> bool:
+        """
+        Check if the provider server is actually down by making a simple health check.
+        Only mark as failed if server is unreachable.
+        """
+        try:
+            # Quick health check with minimal timeout
+            timeout = aiohttp.ClientTimeout(total=5)  # 5 second timeout
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                # Try a simple HTTP GET to check server availability
+                from urllib.parse import urlparse
+                parsed_url = urlparse(provider.http_url)
+                health_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+                async with session.get(health_url) as response:
+                    # Server responded (even with error codes), so it's alive
+                    return False
+        except Exception as health_error:
+            # Server is actually unreachable
+            self.logger.debug(f"Health check failed for {provider.name}: {health_error}")
+            return True
+
     async def _make_request(self, provider: Provider, request: Dict[str, Any]) -> Dict[str, Any]:
         transformed_request = provider.transform_request(request)
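The upshot of `_is_server_failure` is that any HTTP response, even a 4xx or 5xx, counts as the server being alive; only connection-level failures (DNS errors, refused connections, timeouts) trip the backoff, so a healthy provider is no longer benched for a single bad RPC response. A hypothetical sketch of the bookkeeping `mark_failed()` implies, assuming `Provider` tracks a failure timestamp against `BACKOFF_MINUTES` (the real class in this repo may differ):

```python
import time

class Provider:
    def __init__(self, name: str, http_url: str, backoff_minutes: int = 30):
        self.name = name
        self.http_url = http_url
        self.backoff_seconds = backoff_minutes * 60
        self._failed_at = None  # monotonic timestamp of last confirmed server failure

    def mark_failed(self) -> None:
        # Now called only after _is_server_failure() confirms unreachability
        self._failed_at = time.monotonic()

    def is_available(self) -> bool:
        # Provider re-enters the rotation once the backoff window elapses
        if self._failed_at is None:
            return True
        return time.monotonic() - self._failed_at >= self.backoff_seconds
```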