"""
Comprehensive Redis Management System
=====================================

Handles environment-specific Redis configuration, service management,
task recovery, and health monitoring for ClassroomCopilot.

Features:
- Environment isolation (dev/prod/test databases)
- Automatic service management and health checks
- Task recovery and persistence strategies
- Graceful degradation and error handling
"""

import json
import logging
import os
import shutil
import signal
import subprocess
import time
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import quote

import redis

logger = logging.getLogger(__name__)


class Environment(Enum):
    """Deployment environments; each one maps to its own Redis database."""

    DEV = "dev"
    PROD = "prod"
    TEST = "test"


@dataclass
class RedisConfig:
    """Resolved connection and persistence settings for one environment."""

    host: str
    port: int
    db: int
    password: Optional[str]
    ssl: bool
    persist: bool
    task_ttl: int
    url: str


class RedisManager:
    """Comprehensive Redis management with environment isolation and recovery."""

    # Name of the Docker container used by the Docker fallback.
    _DOCKER_CONTAINER = 'classroomcopilot-redis'

    def __init__(self, environment: Environment = Environment.DEV):
        """Load the configuration for *environment*; no connection is made yet.

        Raises:
            ValueError: if a numeric ``REDIS_*`` environment variable is not
                a valid integer (propagated from ``_load_config``).
        """
        self.environment = environment
        self.config = self._load_config()
        self.client: Optional[redis.Redis] = None
        self._subprocess: Optional[subprocess.Popen] = None
        self._health_check_enabled = True

        logger.info(f"🔧 Redis Manager initialized for {environment.value} environment")
        logger.info(f"📡 Target: {self.config.host}:{self.config.port}/db{self.config.db}")

    def _load_config(self) -> RedisConfig:
        """Load environment-specific Redis configuration.

        Reads ``REDIS_HOST``, ``REDIS_PORT``, ``REDIS_PASSWORD`` and
        ``REDIS_SSL`` plus the per-environment ``REDIS_DB_*``,
        ``REDIS_PERSIST_*`` and ``REDIS_TASK_TTL_*`` variables.

        Returns:
            The populated ``RedisConfig``, including a connection URL.

        Raises:
            ValueError: if a numeric variable cannot be parsed as an int.
        """
        # Base configuration
        host = os.getenv('REDIS_HOST', 'localhost')
        port = int(os.getenv('REDIS_PORT', '6379'))
        password = os.getenv('REDIS_PASSWORD') or None
        ssl = os.getenv('REDIS_SSL', 'false').lower() == 'true'

        # Environment-specific settings
        if self.environment == Environment.DEV:
            db = int(os.getenv('REDIS_DB_DEV', '0'))
            persist = os.getenv('REDIS_PERSIST_DEV', 'false').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_DEV', '3600'))
        elif self.environment == Environment.PROD:
            db = int(os.getenv('REDIS_DB_PROD', '1'))
            persist = os.getenv('REDIS_PERSIST_PROD', 'true').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_PROD', '86400'))
        else:  # TEST
            db = int(os.getenv('REDIS_DB_TEST', '2'))
            persist = False
            task_ttl = int(os.getenv('REDIS_TASK_TTL_TEST', '1800'))

        # Construct URL. TLS connections use the "rediss" scheme, and the
        # password is percent-encoded so characters such as "@", "/" or ":"
        # cannot corrupt the URL structure.
        scheme = "rediss" if ssl else "redis"
        auth_part = f":{quote(password, safe='')}@" if password else ""
        url = f"{scheme}://{auth_part}{host}:{port}/{db}"

        return RedisConfig(
            host=host,
            port=port,
            db=db,
            password=password,
            ssl=ssl,
            persist=persist,
            task_ttl=task_ttl,
            url=url,
        )

    @staticmethod
    def _find_executable(name: str, fallback: str) -> Optional[str]:
        """Return the path of *name* from PATH, else *fallback* if it exists."""
        found = shutil.which(name)
        if found:
            return found
        return fallback if os.path.exists(fallback) else None

    def ensure_service_running(self) -> bool:
        """Ensure Redis service is running, start if needed.

        Start methods are tried in order: systemctl, brew services, a direct
        ``redis-server`` launch, then Docker.

        Returns:
            True if Redis answers a ping (already running or just started).
        """
        # Check if Redis is already running
        if self._is_redis_running():
            logger.info("✅ Redis service already running")
            return True

        # Try to start Redis service
        logger.info("🚀 Starting Redis service...")

        # Try systemctl first (Linux production)
        if self._try_systemctl_start():
            return True

        # Try brew services (macOS)
        if self._try_brew_start():
            return True

        # Try direct Redis server start
        if self._try_direct_start():
            return True

        # Try Docker fallback
        if self._try_docker_start():
            return True

        logger.error("❌ Failed to start Redis service with all methods")
        return False

    def _is_redis_running(self) -> bool:
        """Check if a Redis server is answering at the configured host/port.

        The probe uses the same password and TLS settings as ``connect`` so
        a password-protected server is not mistaken for a stopped one.
        """
        probe = None
        try:
            probe = redis.Redis(
                host=self.config.host,
                port=self.config.port,
                password=self.config.password,
                ssl=self.config.ssl,
                socket_connect_timeout=2,
                socket_timeout=2,
            )
            probe.ping()
            return True
        except (redis.exceptions.AuthenticationError, redis.exceptions.ResponseError):
            # The server replied, just not with PONG (e.g. bad credentials):
            # the service is up, so starting another instance would be wrong.
            return True
        except Exception:
            return False
        finally:
            # Always release the probe connection, even when ping() failed.
            if probe is not None:
                try:
                    probe.close()
                except Exception as e:
                    logger.debug(f"Error closing Redis probe connection: {e}")

    def _try_systemctl_start(self) -> bool:
        """Try starting Redis with systemctl."""
        try:
            if not self._find_executable('systemctl', '/usr/bin/systemctl'):
                return False

            subprocess.run(['sudo', 'systemctl', 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)  # give the service a moment to accept connections
            return self._is_redis_running()
        except Exception:
            return False

    def _try_brew_start(self) -> bool:
        """Try starting Redis with brew services."""
        try:
            brew = self._find_executable('brew', '/opt/homebrew/bin/brew')
            if not brew:
                return False

            subprocess.run([brew, 'services', 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)  # give the service a moment to accept connections
            return self._is_redis_running()
        except Exception:
            return False

    def _try_direct_start(self) -> bool:
        """Try starting Redis server directly."""
        try:
            # Find redis-server binary
            redis_cmd = None
            for path in ['/opt/homebrew/bin/redis-server', '/usr/local/bin/redis-server', 'redis-server']:
                try:
                    subprocess.run([path, '--version'], capture_output=True, check=True, timeout=5)
                    redis_cmd = path
                    break
                except Exception:
                    continue

            if not redis_cmd:
                return False

            # Start Redis with appropriate config
            # NOTE(review): protected mode is disabled and no password is
            # set on the started server - confirm this is only ever used
            # against a loopback/dev host.
            config_args = [
                redis_cmd,
                '--port', str(self.config.port),
                '--bind', self.config.host,
                '--protected-mode', 'no',
                '--loglevel', 'notice',
                '--daemonize', 'yes',  # Run as daemon
            ]

            # Add persistence settings
            if not self.config.persist:
                config_args.extend(['--save', '', '--appendonly', 'no'])
            else:
                config_args.extend(['--save', '60 1000', '--appendonly', 'yes'])

            subprocess.run(config_args, check=True, capture_output=True, timeout=10)
            time.sleep(3)
            return self._is_redis_running()
        except Exception as e:
            logger.debug(f"Direct Redis start failed: {e}")
            return False

    def _try_docker_start(self) -> bool:
        """Try starting Redis with Docker."""
        try:
            subprocess.run(['docker', '--version'], capture_output=True, check=True, timeout=5)

            # Check if Redis container already exists
            result = subprocess.run(
                ['docker', 'ps', '-a', '--filter', f'name={self._DOCKER_CONTAINER}', '--format', '{{.Names}}'],
                capture_output=True, text=True, timeout=10
            )

            # Docker's name filter matches substrings, so compare whole
            # names to avoid matching e.g. "classroomcopilot-redis-2".
            existing_names = result.stdout.split()
            if self._DOCKER_CONTAINER in existing_names:
                # Start existing container
                subprocess.run(['docker', 'start', self._DOCKER_CONTAINER],
                               check=True, capture_output=True, timeout=10)
            else:
                # Create new container
                docker_cmd = [
                    'docker', 'run', '-d',
                    '--name', self._DOCKER_CONTAINER,
                    '-p', f'{self.config.port}:6379',
                    'redis:alpine',
                ]
                if not self.config.persist:
                    docker_cmd.extend(['redis-server', '--save', '', '--appendonly', 'no'])

                subprocess.run(docker_cmd, check=True, capture_output=True, timeout=30)

            time.sleep(3)
            return self._is_redis_running()
        except Exception as e:
            logger.debug(f"Docker Redis start failed: {e}")
            return False

    def connect(self) -> bool:
        """Establish Redis connection with retry logic.

        The number of attempts comes from ``REDIS_MAX_RETRY_ATTEMPTS``
        (default 3); values below 1 are treated as 1 so at least one attempt
        is always made.

        Returns:
            True once a ping succeeds, False when every attempt failed.
        """
        max_attempts = max(1, int(os.getenv('REDIS_MAX_RETRY_ATTEMPTS', '3')))

        for attempt in range(1, max_attempts + 1):
            try:
                logger.info(f"🔌 Connecting to Redis... (attempt {attempt}/{max_attempts})")

                self.client = redis.Redis(
                    host=self.config.host,
                    port=self.config.port,
                    db=self.config.db,
                    password=self.config.password,
                    ssl=self.config.ssl,
                    decode_responses=True,
                    socket_connect_timeout=5,
                    socket_timeout=5,
                    retry_on_timeout=True,
                )

                # Test connection
                self.client.ping()
                logger.info(f"✅ Connected to Redis {self.config.host}:{self.config.port}/db{self.config.db}")
                return True

            except Exception as e:
                logger.warning(f"❌ Connection attempt {attempt} failed: {e}")
                if attempt < max_attempts:
                    logger.info("⏳ Retrying in 2 seconds...")
                    time.sleep(2)
                else:
                    logger.error("💥 All connection attempts failed")
                    return False

        return False

    def initialize_environment(self) -> bool:
        """Initialize Redis environment based on mode.

        Returns:
            True if the service is reachable, the connection succeeded and
            the environment-specific initialisation completed.
        """
        if not self.ensure_service_running():
            logger.error("Cannot initialize - Redis service not available")
            return False

        if not self.connect():
            logger.error("Cannot initialize - Connection failed")
            return False

        if self.environment == Environment.DEV:
            return self._initialize_dev_environment()
        elif self.environment == Environment.PROD:
            return self._initialize_prod_environment()
        else:
            return self._initialize_test_environment()

    def _count_keys(self, pattern: str) -> int:
        """Count keys matching *pattern* with SCAN instead of blocking KEYS.

        SCAN may report a key more than once while the keyspace is changing,
        so the result is a statistic, not an exact figure.
        """
        return sum(1 for _ in self.client.scan_iter(match=pattern, count=1000))

    def _initialize_dev_environment(self) -> bool:
        """Initialize development environment - clean slate."""
        try:
            logger.info("🧹 DEV MODE: Clearing all data for clean startup...")

            # Number of keys in this database (DBSIZE is O(1), unlike KEYS *)
            key_count = self.client.dbsize()
            if key_count:
                # Nuclear option - clear everything in this DB
                self.client.flushdb()
                logger.info(f"💥 DEV MODE: Nuked {key_count} keys for clean startup")
            else:
                logger.info("✅ DEV MODE: Database already clean")

            # Set up development-specific config
            # NOTE(review): CONFIG SET applies to the whole Redis server, not
            # just this database - confirm dev never shares an instance with
            # prod, or this turns prod persistence off.
            try:
                self.client.config_set('save', '')  # Disable RDB snapshots
                self.client.config_set('appendonly', 'no')  # Disable AOF
            except Exception as e:
                logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            logger.info("🎯 DEV MODE: Environment initialized for fast iteration")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize dev environment: {e}")
            return False

    def _initialize_prod_environment(self) -> bool:
        """Initialize production environment - preserve data."""
        try:
            logger.info("🏭 PROD MODE: Initializing with data preservation...")

            # Check for existing tasks and report
            task_count = self._count_keys('task:*')
            queue_count = self._count_keys('queue:*')
            processing_count = self._count_keys('processing:*')

            logger.info(f"📊 PROD MODE: Found {task_count} tasks, {queue_count} queues, {processing_count} processing counters")

            # Enable persistence
            if self.config.persist:
                try:
                    self.client.config_set('save', '60 1000')  # Save every 60s if ≥1000 changes
                    self.client.config_set('appendonly', 'yes')  # Enable AOF
                    logger.info("💾 PROD MODE: Persistence enabled")
                except Exception as e:
                    logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            # Recover any stuck tasks
            self._recover_stuck_tasks()

            logger.info("✅ PROD MODE: Environment initialized with data recovery")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize prod environment: {e}")
            return False

    def _initialize_test_environment(self) -> bool:
        """Initialize test environment - isolated and clean."""
        try:
            logger.info("🧪 TEST MODE: Setting up isolated test environment...")

            # Clear test database
            self.client.flushdb()

            # Disable persistence for speed
            # NOTE(review): CONFIG SET is server-wide - confirm the test
            # database does not live on a shared production instance.
            try:
                self.client.config_set('save', '')
                self.client.config_set('appendonly', 'no')
            except Exception as e:
                logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            logger.info("✅ TEST MODE: Clean, isolated environment ready")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize test environment: {e}")
            return False

    def _requeue_stuck_task(self, task_id: str, task_info: str) -> bool:
        """Move one entry of the 'processing' hash back onto its queue.

        Returns:
            True if the task was re-queued, False if its ``task:<id>`` hash
            no longer exists (the stale entry is then just removed).

        Raises:
            Exception: on malformed metadata or Redis errors; the caller
                decides how to handle the entry.
        """
        # Parse task info
        info = json.loads(task_info)

        # Check if task still exists
        task_key = f"task:{task_id}"
        if not self.client.exists(task_key):
            logger.warning(f"⚠️ Task {task_id} data missing, removing from processing")
            self.client.hdel('processing', task_id)
            return False

        priority = info.get('priority', 'normal')
        queue_key = f"queue:{priority}"

        # Apply the state reset, re-queue and removal from 'processing' in a
        # single MULTI/EXEC so a crash in between cannot leave the task both
        # queued and still marked as processing (which would re-queue it
        # twice on the next recovery).
        with self.client.pipeline() as pipe:
            pipe.hset(task_key, 'status', 'pending')
            pipe.hdel(task_key, 'started_at')
            pipe.lpush(queue_key, task_id)
            pipe.hdel('processing', task_id)
            pipe.execute()

        # Reset service counters
        service = info.get('service')
        if service:
            service_key = f"processing:{service}"
            current_count = int(self.client.get(service_key) or 0)
            if current_count > 0:  # never drive the counter negative
                self.client.decr(service_key)

        logger.info(f"🔄 Recovered stuck task {task_id} ({service}/{priority})")
        return True

    def _recover_stuck_tasks(self) -> None:
        """Recover tasks that were processing when system shut down.

        Every entry in the 'processing' hash is reset to 'pending' and pushed
        back onto ``queue:<priority>``. Entries that cannot be recovered are
        removed from 'processing'. Errors are logged, never raised.
        """
        try:
            logger.info("🔄 Recovering stuck tasks from previous session...")

            # Get all processing tasks
            processing_data = self.client.hgetall('processing')

            if not processing_data:
                logger.info("✅ No stuck tasks to recover")
                return

            recovered_count = 0
            for task_id, task_info in processing_data.items():
                try:
                    if self._requeue_stuck_task(task_id, task_info):
                        recovered_count += 1
                except Exception as e:
                    logger.error(f"Failed to recover task {task_id}: {e}")
                    # Remove problematic entry
                    self.client.hdel('processing', task_id)

            logger.info(f"✅ Recovered {recovered_count} stuck tasks")

        except Exception as e:
            logger.error(f"Task recovery failed: {e}")

    def health_check(self) -> Dict[str, Any]:
        """Comprehensive health check.

        Returns:
            A dict with 'status' ('healthy' or 'unhealthy'), 'environment',
            'database', 'connection', 'memory_usage', 'queue_stats' and
            'error' (the failure message, or None).
        """
        health = {
            'status': 'healthy',
            'environment': self.environment.value,
            'database': self.config.db,
            'connection': False,
            'memory_usage': None,
            'queue_stats': {},
            'error': None,
        }

        if not self.client:
            health['status'] = 'unhealthy'
            health['error'] = "No Redis connection"
            return health

        try:
            # Test connection
            self.client.ping()
            health['connection'] = True

            # Get memory usage
            info = self.client.info('memory')
            health['memory_usage'] = {
                'used_memory_human': info.get('used_memory_human', 'unknown'),
                'used_memory_peak_human': info.get('used_memory_peak_human', 'unknown'),
            }

            # Get queue statistics (DBSIZE/SCAN rather than KEYS, which
            # blocks the server for the length of a full keyspace walk)
            health['queue_stats'] = {
                'total_keys': self.client.dbsize(),
                'tasks': self._count_keys('task:*'),
                'queues': {
                    'high': self.client.llen('queue:high'),
                    'normal': self.client.llen('queue:normal'),
                    'low': self.client.llen('queue:low'),
                },
                'processing': self.client.hlen('processing'),
                'dead_letter': self.client.llen('dead_letter'),
            }

        except Exception as e:
            health['status'] = 'unhealthy'
            health['error'] = str(e)

        return health

    def shutdown(self, force: bool = False) -> None:
        """Graceful shutdown with optional data preservation.

        Args:
            force: when True, skip the DEV-mode database flush so the data
                survives the shutdown.
        """
        if self.environment == Environment.DEV and not force:
            logger.info("🧹 DEV MODE: Clearing data on shutdown...")
            try:
                if self.client:
                    self.client.flushdb()
                    logger.info("✅ Dev data cleared")
            except Exception as e:
                logger.warning(f"Failed to clear dev data: {e}")

        # Close connection
        if self.client:
            try:
                self.client.close()
                logger.info("🔌 Redis connection closed")
            except Exception as e:
                logger.warning(f"Error closing Redis connection: {e}")

        # Stop subprocess if we started it
        if self._subprocess:
            try:
                self._subprocess.terminate()
                self._subprocess.wait(timeout=5)
                logger.info("🛑 Redis subprocess stopped")
            except Exception as e:
                logger.warning(f"Error stopping Redis subprocess: {e}")

        logger.info(f"✅ Redis manager shutdown complete ({self.environment.value})")


# Convenience functions for backward compatibility
def get_redis_manager(environment: str = "dev") -> RedisManager:
    """Get a Redis manager instance for the specified environment.

    Args:
        environment: 'dev', 'prod' or 'test' (case-insensitive).

    Raises:
        ValueError: if *environment* is not one of the known names.
    """
    env = Environment(environment.lower())
    return RedisManager(env)


def ensure_redis_running(environment: str = "dev") -> bool:
    """Ensure Redis is running for the specified environment."""
    manager = get_redis_manager(environment)
    return manager.ensure_service_running()