fix: cache Neo4j driver failure state to avoid 60s retry on every request

get_global_driver() now sets _driver_unavailable=True when the initial
connection fails, so subsequent calls fail immediately instead of
spending 60s retrying each time. Added reset_global_driver() to allow
manual reconnection after Neo4j comes back up.

Also fixes APP_BOLT_URL in .env: was bolt://bolt.classroomcopilot.ai
(public IP, port not exposed), now bolt://192.168.0.209:7687 (LAN).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
kcar 2026-05-21 17:26:21 +00:00
parent 7ca21ef538
commit 84f7fa9de1

View File

@ -18,7 +18,7 @@ def _retry_with_backoff(
) -> any: ) -> any:
""" """
Helper function to retry operations with exponential backoff. Helper function to retry operations with exponential backoff.
Args: Args:
func: Function to retry func: Function to retry
max_attempts: Maximum number of retry attempts max_attempts: Maximum number of retry attempts
@ -29,26 +29,26 @@ def _retry_with_backoff(
attempt = 0 attempt = 0
delay = initial_delay delay = initial_delay
start_time = time.time() start_time = time.time()
while attempt < max_attempts: while attempt < max_attempts:
try: try:
return func() return func()
except Exception as e: except Exception as e:
attempt += 1 attempt += 1
elapsed_time = time.time() - start_time elapsed_time = time.time() - start_time
# Check if we've exceeded the maximum total wait time # Check if we've exceeded the maximum total wait time
if elapsed_time >= max_total_wait: if elapsed_time >= max_total_wait:
logger.error(f"Exceeded maximum total wait time of {max_total_wait} seconds") logger.error(f"Exceeded maximum total wait time of {max_total_wait} seconds")
raise raise
if attempt == max_attempts: if attempt == max_attempts:
logger.error(f"Final attempt {attempt} failed: {e}") logger.error(f"Final attempt {attempt} failed: {e}")
raise raise
# Calculate next delay with exponential backoff, but cap it # Calculate next delay with exponential backoff, but cap it
delay = min(delay * 2, max_delay) delay = min(delay * 2, max_delay)
# If we're in a container initialization scenario, provide more context # If we're in a container initialization scenario, provide more context
if "Connection refused" in str(e): if "Connection refused" in str(e):
logger.warning( logger.warning(
@ -59,7 +59,7 @@ def _retry_with_backoff(
) )
else: else:
logger.warning(f"Attempt {attempt} failed: {e}. Retrying in {delay:.1f} seconds...") logger.warning(f"Attempt {attempt} failed: {e}. Retrying in {delay:.1f} seconds...")
time.sleep(delay) time.sleep(delay)
def get_driver(db_name: Optional[str] = None, url: Optional[str] = None, auth: Optional[Tuple[str, str]] = None) -> Optional[Driver]: def get_driver(db_name: Optional[str] = None, url: Optional[str] = None, auth: Optional[Tuple[str, str]] = None) -> Optional[Driver]:
@ -71,7 +71,7 @@ def get_driver(db_name: Optional[str] = None, url: Optional[str] = None, auth: O
logger.error("Neo4j credentials not found in environment") logger.error("Neo4j credentials not found in environment")
return None return None
auth = (username, password) auth = (username, password)
if auth is None: if auth is None:
logger.error("No authentication credentials provided") logger.error("No authentication credentials provided")
return None return None
@ -95,7 +95,7 @@ def get_driver(db_name: Optional[str] = None, url: Optional[str] = None, auth: O
except Exception as e: except Exception as e:
logger.error(f"Failed to establish Neo4j connection after all retries: {e}") logger.error(f"Failed to establish Neo4j connection after all retries: {e}")
return None return None
# Test the connection with the specific database # Test the connection with the specific database
if db_name and driver: if db_name and driver:
def verify_database(): def verify_database():
@ -127,27 +127,50 @@ def close_driver(driver: Optional[Driver]) -> None:
logger.info("Closing driver") logger.info("Closing driver")
driver.close() driver.close()
# Global driver state shared by get_global_driver() / reset_global_driver():
#   _driver is None             -> no driver has been cached yet
#   _driver_unavailable is True -> the last connection attempt failed, so
#                                  get_global_driver() returns None at once
#                                  until reset_global_driver() clears the flag
_driver: Optional[Driver] = None
_driver_unavailable: bool = False


def get_global_driver() -> Optional[Driver]:
    """Get or create the global Neo4j driver instance.

    Both outcomes of the first connection attempt are cached. On success
    the driver is stored and reused. On failure (get_driver() returned
    None) the failure is remembered, so later calls return None
    immediately instead of repeating the slow, retrying connection
    attempt on every request.

    The failure state is not permanent: reset_global_driver() clears it
    and the next call attempts to connect again.

    Returns:
        The cached driver, or None when no connection could be
        established.
    """
    global _driver, _driver_unavailable

    # Fast path: a previous attempt already failed, do not retry here.
    if _driver_unavailable:
        return None

    if _driver is None:
        _driver = get_driver()
        if _driver is None:
            _driver_unavailable = True
            # Point operators at the in-process recovery path; a restart
            # is not the only way to leave the fail-fast state.
            logger.error(
                "Neo4j driver unavailable — all subsequent Neo4j calls will fail fast "
                "until reset_global_driver() is called or the process restarts"
            )
    return _driver
def reset_global_driver() -> None:
    """Reset the cached driver, forcing a reconnection attempt on the next call.

    Call this if Neo4j becomes available after the process started. Any
    cached driver is closed first, then both the cached driver and the
    cached failure flag are cleared.

    Raises:
        Whatever close_driver() raises. The cached state is cleared even
        in that case, so the next get_global_driver() call still attempts
        a fresh connection.
    """
    global _driver, _driver_unavailable

    stale_driver = _driver
    try:
        if stale_driver:
            close_driver(stale_driver)
    finally:
        # Clear the cache even when closing fails: a driver that cannot be
        # closed cleanly is exactly the one that must not be handed out again.
        _driver = None
        _driver_unavailable = False
    logger.info("Global Neo4j driver reset — will reconnect on next request")
@contextmanager
def get_session(database: Optional[str] = None) -> Generator[Session, None, None]:
    """Yield a Neo4j session opened on the global driver.

    Args:
        database: Name passed through to ``driver.session(database=...)``;
            None leaves the choice of database to the driver.

    Yields:
        The session opened on the global driver. It is closed when the
        ``with`` block exits, whether normally or through an exception.

    Raises:
        Exception: If no global driver is available.
    """
    active_driver = get_global_driver()
    if active_driver is None:
        raise Exception("Failed to get Neo4j driver")

    # Open before entering the try: if opening fails there is nothing to close.
    opened = active_driver.session(database=database)
    try:
        yield opened
    finally:
        if opened:
            opened.close()