# api/modules/redis_manager.py
# 2025-11-14 14:47:19 +00:00
#
# 524 lines
# 20 KiB
# Python

"""
Comprehensive Redis Management System
=====================================
Handles environment-specific Redis configuration, service management,
task recovery, and health monitoring for ClassroomCopilot.
Features:
- Environment isolation (dev/prod/test databases)
- Automatic service management and health checks
- Task recovery and persistence strategies
- Graceful degradation and error handling
"""
import os
import redis
import subprocess
import time
import signal
import json
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
from enum import Enum
logger = logging.getLogger(__name__)
class Environment(Enum):
    """Deployment environments a Redis manager can be configured for.

    Each member's value is the lower-case name used when looking an
    environment up from a string (e.g. ``Environment("dev")``).
    """

    # Development environment.
    DEV = "dev"
    # Production environment.
    PROD = "prod"
    # Test environment.
    TEST = "test"
@dataclass
class RedisConfig:
    """Resolved Redis connection and behaviour settings for one environment."""

    # Server hostname or address.
    host: str
    # Server TCP port.
    port: int
    # Numeric Redis database index.
    db: int
    # Password for the server, or None when no password is configured.
    password: Optional[str]
    # Whether SSL was requested in the configuration.
    ssl: bool
    # Whether persistence is requested for this environment.
    persist: bool
    # Task TTL setting (presumably seconds - TODO confirm against consumers).
    task_ttl: int
    # Connection URL assembled from the fields above.
    url: str
class RedisManager:
    """Comprehensive Redis management with environment isolation and recovery.

    A manager is bound to one ``Environment``. It resolves that
    environment's settings from environment variables, can try to start a
    Redis server when none is reachable, opens a client connection,
    prepares the database for the environment (flush for dev/test,
    preserve-and-recover for prod), reports health, and shuts down.
    """

    # Batch-size hint handed to SCAN when counting keys by pattern.
    _SCAN_BATCH_SIZE = 1000

    def __init__(self, environment: Environment = Environment.DEV):
        """Resolve configuration for *environment*; no connection is opened.

        Args:
            environment: The environment whose database index, persistence
                flag and task TTL should be loaded.
        """
        self.environment = environment
        self.config = self._load_config()
        self.client: Optional[redis.Redis] = None
        # Nothing in this class assigns a process here; shutdown() stops
        # one if some other code has set it.
        self._subprocess: Optional[subprocess.Popen] = None
        self._health_check_enabled = True
        logger.info(f"🔧 Redis Manager initialized for {environment.value} environment")
        logger.info(f"📡 Target: {self.config.host}:{self.config.port}/db{self.config.db}")

    def _load_config(self) -> RedisConfig:
        """Load environment-specific Redis configuration.

        Reads ``REDIS_HOST``, ``REDIS_PORT``, ``REDIS_PASSWORD`` and
        ``REDIS_SSL`` for the connection, then the ``*_DEV`` / ``*_PROD`` /
        ``*_TEST`` variables for database index, persistence and task TTL.
        The test environment never persists.

        Returns:
            A populated ``RedisConfig``. Its ``url`` uses the ``rediss``
            scheme when SSL is enabled and percent-encodes the password.

        Raises:
            ValueError: If a numeric variable does not parse as an integer.
        """
        # Function-scope import: only needed for building the URL.
        from urllib.parse import quote

        # Base configuration
        host = os.getenv('REDIS_HOST', 'localhost')
        port = int(os.getenv('REDIS_PORT', '6379'))
        password = os.getenv('REDIS_PASSWORD') or None
        ssl = os.getenv('REDIS_SSL', 'false').lower() == 'true'

        # Environment-specific settings
        if self.environment == Environment.DEV:
            db = int(os.getenv('REDIS_DB_DEV', '0'))
            persist = os.getenv('REDIS_PERSIST_DEV', 'false').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_DEV', '3600'))
        elif self.environment == Environment.PROD:
            db = int(os.getenv('REDIS_DB_PROD', '1'))
            persist = os.getenv('REDIS_PERSIST_PROD', 'true').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_PROD', '86400'))
        else:  # TEST
            db = int(os.getenv('REDIS_DB_TEST', '2'))
            persist = False
            task_ttl = int(os.getenv('REDIS_TASK_TTL_TEST', '1800'))

        # Construct URL. The scheme must reflect the SSL flag, and the
        # password is percent-encoded so characters such as '@', ':' or '/'
        # cannot corrupt the URL structure.
        scheme = "rediss" if ssl else "redis"
        auth_part = f":{quote(password, safe='')}@" if password else ""
        url = f"{scheme}://{auth_part}{host}:{port}/{db}"

        return RedisConfig(
            host=host,
            port=port,
            db=db,
            password=password,
            ssl=ssl,
            persist=persist,
            task_ttl=task_ttl,
            url=url,
        )

    def _connection_kwargs(self) -> Dict[str, Any]:
        """Return the client keyword arguments shared by every connection.

        Keeping these in one place ensures the reachability probe and the
        real connection use the same host, port, password and SSL setting.
        """
        return {
            'host': self.config.host,
            'port': self.config.port,
            'password': self.config.password,
            'ssl': self.config.ssl,
        }

    def _count_keys(self, pattern: str) -> int:
        """Count keys matching *pattern* in the current database via SCAN.

        SCAN walks the keyspace in batches instead of issuing one blocking
        KEYS call. Results are collected in a set because SCAN may return
        the same key more than once.
        """
        matches = self.client.scan_iter(match=pattern, count=self._SCAN_BATCH_SIZE)
        return len(set(matches))

    def ensure_service_running(self) -> bool:
        """Ensure Redis service is running, start if needed.

        Start-up strategies are tried in order: systemctl, brew services,
        a direct ``redis-server`` launch, then Docker.

        Returns:
            True when Redis is reachable (already, or after a successful
            start); False when every strategy failed.
        """
        # Check if Redis is already running
        if self._is_redis_running():
            logger.info("✅ Redis service already running")
            return True

        # Try to start Redis service
        logger.info("🚀 Starting Redis service...")
        starters = (
            self._try_systemctl_start,  # Linux production
            self._try_brew_start,       # macOS
            self._try_direct_start,     # redis-server binary
            self._try_docker_start,     # Docker fallback
        )
        for start in starters:
            if start():
                return True

        logger.error("❌ Failed to start Redis service with all methods")
        return False

    def _is_redis_running(self) -> bool:
        """Check if Redis is accessible with the configured credentials.

        Returns:
            True if a PING succeeds within the 2-second timeouts, False on
            any error.
        """
        probe = None
        try:
            probe = redis.Redis(
                **self._connection_kwargs(),
                socket_connect_timeout=2,
                socket_timeout=2,
            )
            probe.ping()
            return True
        except Exception:
            return False
        finally:
            # Release the probe connection on both success and failure.
            if probe is not None:
                try:
                    probe.close()
                except Exception as e:
                    logger.debug(f"Error closing Redis probe connection: {e}")

    def _try_systemctl_start(self) -> bool:
        """Try starting Redis with systemctl.

        Returns:
            False if ``/usr/bin/systemctl`` is absent or the command fails;
            otherwise whether Redis is reachable two seconds after the start.
        """
        try:
            if not os.path.exists('/usr/bin/systemctl'):
                return False
            subprocess.run(['sudo', 'systemctl', 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)  # Give the service a moment to accept connections
            return self._is_redis_running()
        except Exception:
            return False

    def _try_brew_start(self) -> bool:
        """Try starting Redis with brew services.

        Both Homebrew prefixes are checked, matching the prefixes
        ``_try_direct_start`` searches for ``redis-server``.

        Returns:
            False if no brew binary is found or the command fails;
            otherwise whether Redis is reachable two seconds after the start.
        """
        try:
            brew_cmd = None
            for candidate in ('/opt/homebrew/bin/brew', '/usr/local/bin/brew'):
                if os.path.exists(candidate):
                    brew_cmd = candidate
                    break
            if not brew_cmd:
                return False
            subprocess.run([brew_cmd, 'services', 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)  # Give the service a moment to accept connections
            return self._is_redis_running()
        except Exception:
            return False

    def _try_direct_start(self) -> bool:
        """Try starting Redis server directly.

        Looks for a working ``redis-server`` binary, then launches it as a
        daemon on the configured port and host, with persistence flags
        matching ``config.persist``.

        Returns:
            False if no binary responds to ``--version`` or the launch
            fails; otherwise whether Redis is reachable three seconds later.
        """
        try:
            # Find redis-server binary
            redis_cmd = None
            for path in ['/opt/homebrew/bin/redis-server', '/usr/local/bin/redis-server', 'redis-server']:
                try:
                    subprocess.run([path, '--version'], capture_output=True, check=True, timeout=5)
                    redis_cmd = path
                    break
                except Exception:
                    continue
            if not redis_cmd:
                return False

            # Start Redis with appropriate config
            # NOTE(review): protected mode is disabled for the spawned
            # server - confirm this is intended outside local development.
            config_args = [
                redis_cmd,
                '--port', str(self.config.port),
                '--bind', self.config.host,
                '--protected-mode', 'no',
                '--loglevel', 'notice',
                '--daemonize', 'yes',  # Run as daemon
            ]

            # Add persistence settings
            if not self.config.persist:
                config_args.extend(['--save', '', '--appendonly', 'no'])
            else:
                config_args.extend(['--save', '60 1000', '--appendonly', 'yes'])

            subprocess.run(config_args, check=True, capture_output=True, timeout=10)
            time.sleep(3)  # Give the daemon a moment to accept connections
            return self._is_redis_running()
        except Exception as e:
            logger.debug(f"Direct Redis start failed: {e}")
            return False

    def _try_docker_start(self) -> bool:
        """Try starting Redis with Docker.

        Reuses the ``classroomcopilot-redis`` container if it exists,
        otherwise creates it from ``redis:alpine`` with the configured
        port mapped to the container's 6379.

        Returns:
            False if Docker is unavailable or a command fails; otherwise
            whether Redis is reachable three seconds later.
        """
        try:
            subprocess.run(['docker', '--version'], capture_output=True, check=True, timeout=5)

            # Check if Redis container already exists
            result = subprocess.run(
                ['docker', 'ps', '-a', '--filter', 'name=classroomcopilot-redis', '--format', '{{.Names}}'],
                capture_output=True, text=True, timeout=10
            )
            if 'classroomcopilot-redis' in result.stdout:
                # Start existing container
                subprocess.run(['docker', 'start', 'classroomcopilot-redis'],
                               check=True, capture_output=True, timeout=10)
            else:
                # Create new container
                docker_cmd = [
                    'docker', 'run', '-d',
                    '--name', 'classroomcopilot-redis',
                    '-p', f'{self.config.port}:6379',
                    'redis:alpine',
                ]
                if not self.config.persist:
                    docker_cmd.extend(['redis-server', '--save', '', '--appendonly', 'no'])
                subprocess.run(docker_cmd, check=True, capture_output=True, timeout=30)

            time.sleep(3)  # Give the container a moment to accept connections
            return self._is_redis_running()
        except Exception as e:
            logger.debug(f"Docker Redis start failed: {e}")
            return False

    def connect(self) -> bool:
        """Establish Redis connection with retry logic.

        The number of attempts comes from ``REDIS_MAX_RETRY_ATTEMPTS``
        (default 3), with a 2-second pause between attempts. The client is
        created with the configured database, password and SSL setting.

        Returns:
            True once a PING succeeds; False if every attempt failed.
        """
        max_attempts = int(os.getenv('REDIS_MAX_RETRY_ATTEMPTS', '3'))
        for attempt in range(1, max_attempts + 1):
            try:
                logger.info(f"🔌 Connecting to Redis... (attempt {attempt}/{max_attempts})")
                self.client = redis.Redis(
                    **self._connection_kwargs(),
                    db=self.config.db,
                    decode_responses=True,
                    socket_connect_timeout=5,
                    socket_timeout=5,
                    retry_on_timeout=True,
                )
                # Test connection
                self.client.ping()
                logger.info(f"✅ Connected to Redis {self.config.host}:{self.config.port}/db{self.config.db}")
                return True
            except Exception as e:
                logger.warning(f"❌ Connection attempt {attempt} failed: {e}")
                if attempt < max_attempts:
                    logger.info("⏳ Retrying in 2 seconds...")
                    time.sleep(2)
                else:
                    logger.error("💥 All connection attempts failed")
                    return False
        # Only reached when max_attempts is zero or negative.
        return False

    def initialize_environment(self) -> bool:
        """Initialize Redis environment based on mode.

        Ensures the service is running, connects, then runs the
        dev/prod/test initializer for this manager's environment.

        Returns:
            True when all three steps succeed, False otherwise.
        """
        if not self.ensure_service_running():
            logger.error("Cannot initialize - Redis service not available")
            return False
        if not self.connect():
            logger.error("Cannot initialize - Connection failed")
            return False

        if self.environment == Environment.DEV:
            return self._initialize_dev_environment()
        elif self.environment == Environment.PROD:
            return self._initialize_prod_environment()
        else:
            return self._initialize_test_environment()

    def _initialize_dev_environment(self) -> bool:
        """Initialize development environment - clean slate.

        Flushes the current database if it holds any keys and, on a
        best-effort basis, disables RDB snapshots and AOF.

        Returns:
            True on success, False if an unexpected error occurred.
        """
        try:
            logger.info("🧹 DEV MODE: Clearing all data for clean startup...")
            # DBSIZE gives the key count without enumerating every key.
            key_count = self.client.dbsize()
            if key_count:
                # Nuclear option - clear everything in this DB
                self.client.flushdb()
                logger.info(f"💥 DEV MODE: Nuked {key_count} keys for clean startup")
            else:
                logger.info("✅ DEV MODE: Database already clean")

            # Set up development-specific config
            try:
                self.client.config_set('save', '')  # Disable RDB snapshots
                self.client.config_set('appendonly', 'no')  # Disable AOF
            except Exception as e:
                logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            logger.info("🎯 DEV MODE: Environment initialized for fast iteration")
            return True
        except Exception as e:
            logger.error(f"Failed to initialize dev environment: {e}")
            return False

    def _initialize_prod_environment(self) -> bool:
        """Initialize production environment - preserve data.

        Reports how many task, queue and processing keys exist, enables
        persistence when configured (best effort), and re-queues tasks left
        in the ``processing`` hash.

        Returns:
            True on success, False if an unexpected error occurred.
        """
        try:
            logger.info("🏭 PROD MODE: Initializing with data preservation...")

            # Check for existing tasks and report. SCAN-based counting
            # avoids blocking the production server with KEYS.
            task_count = self._count_keys('task:*')
            queue_count = self._count_keys('queue:*')
            processing_count = self._count_keys('processing:*')
            logger.info(f"📊 PROD MODE: Found {task_count} tasks, {queue_count} queues, {processing_count} processing counters")

            # Enable persistence
            if self.config.persist:
                try:
                    self.client.config_set('save', '60 1000')  # Save every 60s if ≥1000 changes
                    self.client.config_set('appendonly', 'yes')  # Enable AOF
                    logger.info("💾 PROD MODE: Persistence enabled")
                except Exception as e:
                    logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            # Recover any stuck tasks
            self._recover_stuck_tasks()

            logger.info("✅ PROD MODE: Environment initialized with data recovery")
            return True
        except Exception as e:
            logger.error(f"Failed to initialize prod environment: {e}")
            return False

    def _initialize_test_environment(self) -> bool:
        """Initialize test environment - isolated and clean.

        Flushes the current database and, on a best-effort basis, disables
        RDB snapshots and AOF.

        Returns:
            True on success, False if an unexpected error occurred.
        """
        try:
            logger.info("🧪 TEST MODE: Setting up isolated test environment...")
            # Clear test database
            self.client.flushdb()

            # Disable persistence for speed
            try:
                self.client.config_set('save', '')
                self.client.config_set('appendonly', 'no')
            except Exception as e:
                logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            logger.info("✅ TEST MODE: Clean, isolated environment ready")
            return True
        except Exception as e:
            logger.error(f"Failed to initialize test environment: {e}")
            return False

    def _recover_stuck_tasks(self):
        """Recover tasks that were processing when system shut down.

        For every entry in the ``processing`` hash: if the ``task:<id>``
        key is gone the entry is dropped; otherwise the task is marked
        ``pending``, pushed onto ``queue:<priority>``, removed from
        ``processing``, and its ``processing:<service>`` counter is
        decremented when positive. An entry that cannot be processed is
        removed, and a failure on one entry does not stop the others.
        """
        try:
            logger.info("🔄 Recovering stuck tasks from previous session...")

            # Get all processing tasks
            processing_data = self.client.hgetall('processing')
            if not processing_data:
                logger.info("✅ No stuck tasks to recover")
                return

            recovered_count = 0
            for task_id, task_info in processing_data.items():
                try:
                    # Parse task info
                    info = json.loads(task_info)

                    # Check if task still exists
                    task_key = f"task:{task_id}"
                    if not self.client.exists(task_key):
                        logger.warning(f"⚠️ Task {task_id} data missing, removing from processing")
                        self.client.hdel('processing', task_id)
                        continue

                    # Reset task to pending and re-queue
                    self.client.hset(task_key, 'status', 'pending')
                    self.client.hdel(task_key, 'started_at')

                    # Add back to appropriate queue
                    priority = info.get('priority', 'normal')
                    queue_key = f"queue:{priority}"
                    self.client.lpush(queue_key, task_id)

                    # Remove from processing
                    self.client.hdel('processing', task_id)

                    # Reset service counters
                    service = info.get('service')
                    if service:
                        service_key = f"processing:{service}"
                        current_count = int(self.client.get(service_key) or 0)
                        if current_count > 0:
                            self.client.decr(service_key)

                    recovered_count += 1
                    logger.info(f"🔄 Recovered stuck task {task_id} ({service}/{priority})")
                except Exception as e:
                    logger.error(f"Failed to recover task {task_id}: {e}")
                    # Remove problematic entry. Guarded so that a failing
                    # cleanup cannot abort recovery of the remaining tasks.
                    try:
                        self.client.hdel('processing', task_id)
                    except Exception as cleanup_error:
                        logger.error(f"Failed to remove task {task_id} from processing: {cleanup_error}")

            logger.info(f"✅ Recovered {recovered_count} stuck tasks")
        except Exception as e:
            logger.error(f"Task recovery failed: {e}")

    def health_check(self) -> Dict[str, Any]:
        """Comprehensive health check.

        Returns:
            A dict with ``status`` (``'healthy'`` or ``'unhealthy'``),
            ``environment``, ``database``, ``connection``,
            ``memory_usage``, ``queue_stats`` and ``error``. Any exception
            (including a missing client) marks the status unhealthy and
            stores its message under ``error``; this method does not raise.
        """
        health = {
            'status': 'healthy',
            'environment': self.environment.value,
            'database': self.config.db,
            'connection': False,
            'memory_usage': None,
            'queue_stats': {},
            'error': None,
        }
        try:
            if not self.client:
                raise Exception("No Redis connection")

            # Test connection
            self.client.ping()
            health['connection'] = True

            # Get memory usage
            info = self.client.info('memory')
            health['memory_usage'] = {
                'used_memory_human': info.get('used_memory_human', 'unknown'),
                'used_memory_peak_human': info.get('used_memory_peak_human', 'unknown'),
            }

            # Get queue statistics. DBSIZE and SCAN keep a routine health
            # probe from blocking the server the way KEYS would.
            health['queue_stats'] = {
                'total_keys': self.client.dbsize(),
                'tasks': self._count_keys('task:*'),
                'queues': {
                    'high': self.client.llen('queue:high'),
                    'normal': self.client.llen('queue:normal'),
                    'low': self.client.llen('queue:low'),
                },
                'processing': self.client.hlen('processing'),
                'dead_letter': self.client.llen('dead_letter'),
            }
        except Exception as e:
            health['status'] = 'unhealthy'
            health['error'] = str(e)
        return health

    def shutdown(self, force: bool = False):
        """Graceful shutdown with optional data preservation.

        Args:
            force: In the DEV environment the database is flushed on
                shutdown unless ``force`` is True. Other environments are
                never flushed here.

        The client connection is then closed and, if a subprocess handle
        is set, the process is terminated (and killed if it does not exit
        within 5 seconds). Errors in each step are logged, not raised.
        """
        if self.environment == Environment.DEV and not force:
            logger.info("🧹 DEV MODE: Clearing data on shutdown...")
            try:
                if self.client:
                    self.client.flushdb()
                    logger.info("✅ Dev data cleared")
            except Exception as e:
                logger.warning(f"Failed to clear dev data: {e}")

        # Close connection
        if self.client:
            try:
                self.client.close()
                logger.info("🔌 Redis connection closed")
            except Exception as e:
                logger.warning(f"Error closing Redis connection: {e}")

        # Stop subprocess if we started it
        if self._subprocess:
            try:
                self._subprocess.terminate()
                try:
                    self._subprocess.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    # The process ignored SIGTERM; do not leave it running.
                    self._subprocess.kill()
                    self._subprocess.wait()
                logger.info("🛑 Redis subprocess stopped")
            except Exception as e:
                logger.warning(f"Error stopping Redis subprocess: {e}")

        logger.info(f"✅ Redis manager shutdown complete ({self.environment.value})")
# Convenience functions for backward compatibility
def get_redis_manager(environment: str = "dev") -> RedisManager:
    """Build a ``RedisManager`` for the environment named by *environment*.

    The name is lower-cased and looked up by value in ``Environment``, so
    it is matched case-insensitively; a name that is not an ``Environment``
    value makes the enum lookup raise ``ValueError``.
    """
    return RedisManager(Environment(environment.lower()))
def ensure_redis_running(environment: str = "dev") -> bool:
    """Check that Redis is up for the named environment, starting it if needed.

    A fresh manager is obtained from ``get_redis_manager`` and its
    ``ensure_service_running`` result is returned unchanged.
    """
    return get_redis_manager(environment).ensure_service_running()