Update back off mechanism to only back off if the endpoint is not available

This commit is contained in:
Shreerang Kale 2025-08-01 15:53:35 +05:30
parent 307abfc78f
commit 0dc3f060ef
6 changed files with 55 additions and 26 deletions

View File

@ -6,7 +6,7 @@ QUICKNODE_TOKEN=your_token_here
# Proxy settings
PROXY_PORT=8545
CACHE_SIZE_GB=100
CACHE_SIZE_GB=1
BACKOFF_MINUTES=30
# Logging

View File

@ -6,7 +6,7 @@ import diskcache
class Cache:
def __init__(self, cache_dir: str = "./cache", size_limit_gb: int = 100):
def __init__(self, cache_dir: str = "./cache", size_limit_gb: int = 1):
self.cache_dir = cache_dir
self.size_limit_bytes = size_limit_gb * 1024 * 1024 * 1024
self.cache = diskcache.Cache(
@ -57,4 +57,3 @@ class Cache:
"count": stats[0],
"limit_gb": self.size_limit_bytes / (1024 * 1024 * 1024)
}

View File

@ -44,14 +44,15 @@ class CachePolicy:
Returns:
True if the method should be cached, False otherwise
"""
if method in self.CACHEABLE_IMMUTABLE:
if method in self.CACHEABLE_WITH_TTL:
# For getBlock, only cache finalized blocks
if method == 'getBlock':
commitment = self._get_commitment(params)
return commitment == 'finalized'
return True
if method in self.CACHEABLE_WITH_TTL:
if method in self.CACHEABLE_IMMUTABLE:
return True
# Default to not caching unknown methods

View File

@ -34,7 +34,7 @@ Provider class:
Cache class:
- get(method: str, params: dict) -> Optional[response]
- set(method: str, params: dict, response: dict) -> None
- size_check() -> None # Enforce 100GB limit
- size_check() -> None # Enforce 1GB limit
- clear_oldest() -> None # LRU eviction
```
@ -42,7 +42,7 @@ Cache class:
- Use `diskcache` library for simplicity
- Key format: `f"{method}:{json.dumps(params, sort_keys=True)}"`
- Store both HTTP responses and WebSocket messages
- Implement 100GB limit with LRU eviction
- Implement 1GB limit with LRU eviction
### 3. Error Logger Module (`errors.py`)
**Purpose**: SQLite-based error logging with UUID tracking
@ -146,7 +146,7 @@ QUICKNODE_TOKEN=your_token_here
# Proxy settings
PROXY_PORT=8545
CACHE_SIZE_GB=100
CACHE_SIZE_GB=1
BACKOFF_MINUTES=30
# Logging
@ -227,7 +227,7 @@ Happy-path end-to-end tests only:
## Deployment Considerations
1. **Cache Storage**: Need ~100GB disk space
1. **Cache Storage**: Need ~1GB disk space
2. **Memory Usage**: Keep minimal, use disk cache
3. **Concurrent Clients**: Basic round-robin if multiple connect
4. **Monitoring**: Log all errors, provide error IDs
@ -273,7 +273,7 @@ aiohttp-cors==0.7.0
1. Single endpoint proxies to 5 providers
2. Automatic failover works
3. Responses are cached (up to 100GB)
3. Responses are cached (up to 1GB)
4. Errors logged with retrievable IDs
5. Both HTTP and WebSocket work
6. Response format is unified

View File

@ -35,7 +35,7 @@ def load_config() -> dict:
return {
"proxy_port": int(os.getenv("PROXY_PORT", 8545)),
"cache_size_gb": int(os.getenv("CACHE_SIZE_GB", 100)),
"cache_size_gb": int(os.getenv("CACHE_SIZE_GB", 1)),
"backoff_minutes": int(os.getenv("BACKOFF_MINUTES", 30)),
"log_level": os.getenv("LOG_LEVEL", "INFO"),
"error_db_path": os.getenv("ERROR_DB_PATH", "./errors.db"),
@ -72,7 +72,7 @@ def main() -> None:
logger = logging.getLogger(__name__)
logger.info(f"Starting Solana RPC Proxy on port {config['proxy_port']}")
logger.info(f"Intelligent caching enabled - Cache size limit: {config['cache_size_gb']}GB")
logger.info(f"Cache size limit: {config['cache_size_gb']}GB")
logger.info(f"Provider backoff time: {config['backoff_minutes']} minutes")
app = create_app(config)

View File

@ -20,7 +20,7 @@ class Router:
async def route_request(self, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
request = {"method": method, "params": params}
# Check if this method should be cached based on intelligent caching policy
# Check if this method should be cached based on caching policy
should_cache = self.cache_policy.should_cache(method, params)
if should_cache:
@ -60,7 +60,13 @@ class Router:
except Exception as error:
error_id = self.error_logger.log_error(provider.name, request, error)
self.logger.warning(f"Provider {provider.name} failed: {error} (ID: {error_id})")
# Only mark provider as failed for server/network issues, not RPC errors
if await self._is_server_failure(provider, error):
provider.mark_failed()
self.logger.warning(f"Provider {provider.name} marked as failed due to server issue")
else:
self.logger.debug(f"Provider {provider.name} had RPC error but server is available")
return self._create_error_response(
"All providers failed to handle the request",
@ -77,6 +83,29 @@ class Router:
return None
async def _is_server_failure(self, provider: Provider, error: Exception) -> bool:
    """
    Decide whether a provider failure reflects a real server outage.

    Fires a lightweight HTTP probe at the provider's host root. Any HTTP
    response at all — even an error status code — proves the server is
    reachable, so the triggering failure was an RPC-level problem and the
    provider should NOT be backed off.

    Args:
        provider: Provider whose availability is being probed.
        error: The exception that triggered this check (not inspected;
            kept for interface compatibility).

    Returns:
        True if the provider host is unreachable, False otherwise.
    """
    try:
        # Keep the probe quick so a dead host doesn't stall routing.
        probe_timeout = aiohttp.ClientTimeout(total=5)  # 5 second timeout
        async with aiohttp.ClientSession(timeout=probe_timeout) as session:
            # Probe only scheme://host — a plain GET against the root.
            from urllib.parse import urlparse
            parts = urlparse(provider.http_url)
            probe_url = f"{parts.scheme}://{parts.netloc}"
            async with session.get(probe_url) as probe_response:
                # Any response (including 4xx/5xx) means the server is alive.
                return False
    except Exception as probe_error:
        # No response at all within the timeout: treat the host as down.
        self.logger.debug(f"Health check failed for {provider.name}: {probe_error}")
        return True
async def _make_request(self, provider: Provider, request: Dict[str, Any]) -> Dict[str, Any]:
transformed_request = provider.transform_request(request)