"""
Comprehensive Redis Management System
=====================================

Handles environment-specific Redis configuration, service management,
task recovery, and health monitoring for ClassroomCopilot.

Features:
- Environment isolation (dev/prod/test databases)
- Automatic service management and health checks
- Task recovery and persistence strategies
- Graceful degradation and error handling
"""
|
|
|
|
import os
|
|
import redis
|
|
import subprocess
|
|
import time
|
|
import signal
|
|
import json
|
|
import logging
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class Environment(Enum):
    """Deployment environment; each value selects an isolated Redis
    database and persistence policy in RedisManager._load_config."""

    DEV = "dev"
    PROD = "prod"
    TEST = "test"
|
|
|
|
@dataclass
class RedisConfig:
    """Resolved Redis connection and persistence settings for one environment."""

    host: str                # Redis server hostname
    port: int                # Redis server TCP port
    db: int                  # logical database number (isolates dev/prod/test)
    password: Optional[str]  # AUTH password, or None when auth is disabled
    ssl: bool                # whether to connect over TLS
    persist: bool            # whether RDB/AOF persistence should be enabled
    task_ttl: int            # task expiry in seconds for this environment
    url: str                 # redis:// connection URL assembled from the above
|
|
|
|
class RedisManager:
    """Comprehensive Redis management with environment isolation and recovery.

    Responsibilities:
    - Resolve environment-specific config from env vars (_load_config).
    - Ensure a Redis server is reachable, starting one via systemctl,
      brew, a direct ``redis-server`` launch, or Docker as fallbacks.
    - Initialize each environment (dev: wipe, prod: preserve + recover,
      test: wipe + no persistence).
    - Provide health checks and a graceful shutdown.
    """

    def __init__(self, environment: Environment = Environment.DEV):
        """Bind the manager to one environment's database and policies.

        Args:
            environment: Which isolated Redis database / persistence
                strategy to use (dev, prod, or test).
        """
        self.environment = environment
        self.config = self._load_config()
        # Set by connect(); stays None until a connection succeeds.
        self.client: Optional[redis.Redis] = None
        # Would hold a Redis process we spawned ourselves; the current
        # start strategies all use subprocess.run, so this stays None.
        self._subprocess: Optional[subprocess.Popen] = None
        self._health_check_enabled = True

        logger.info(f"🔧 Redis Manager initialized for {environment.value} environment")
        logger.info(f"📡 Target: {self.config.host}:{self.config.port}/db{self.config.db}")

    def _load_config(self) -> RedisConfig:
        """Load environment-specific Redis configuration from env vars."""

        # Base configuration shared by every environment.
        host = os.getenv('REDIS_HOST', 'localhost')
        port = int(os.getenv('REDIS_PORT', '6379'))
        password = os.getenv('REDIS_PASSWORD') or None  # empty string counts as unset
        ssl = os.getenv('REDIS_SSL', 'false').lower() == 'true'

        # Environment-specific settings: separate database numbers keep
        # dev/prod/test data from ever mixing on a shared server.
        if self.environment == Environment.DEV:
            db = int(os.getenv('REDIS_DB_DEV', '0'))
            persist = os.getenv('REDIS_PERSIST_DEV', 'false').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_DEV', '3600'))
        elif self.environment == Environment.PROD:
            db = int(os.getenv('REDIS_DB_PROD', '1'))
            persist = os.getenv('REDIS_PERSIST_PROD', 'true').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_PROD', '86400'))
        else:  # TEST
            db = int(os.getenv('REDIS_DB_TEST', '2'))
            persist = False  # tests are always ephemeral
            task_ttl = int(os.getenv('REDIS_TASK_TTL_TEST', '1800'))

        # Construct a redis:// URL for consumers that take a connection string.
        auth_part = f":{password}@" if password else ""
        url = f"redis://{auth_part}{host}:{port}/{db}"

        return RedisConfig(
            host=host,
            port=port,
            db=db,
            password=password,
            ssl=ssl,
            persist=persist,
            task_ttl=task_ttl,
            url=url
        )

    def ensure_service_running(self) -> bool:
        """Ensure Redis service is running, start if needed.

        Tries each start strategy in order (systemctl, brew, direct
        binary, Docker) and returns True as soon as Redis is reachable.
        """

        # Check if Redis is already running
        if self._is_redis_running():
            logger.info("✅ Redis service already running")
            return True

        # Try to start Redis service
        logger.info("🚀 Starting Redis service...")

        # Try systemctl first (Linux production)
        if self._try_systemctl_start():
            return True

        # Try brew services (macOS)
        if self._try_brew_start():
            return True

        # Try direct Redis server start
        if self._try_direct_start():
            return True

        # Try Docker fallback
        if self._try_docker_start():
            return True

        logger.error("❌ Failed to start Redis service with all methods")
        return False

    def _is_redis_running(self) -> bool:
        """Check if Redis is accessible (answers PING within 2 seconds)."""
        try:
            # BUGFIX: pass the configured password/ssl; without them an
            # auth-protected or TLS server is misreported as "down" and
            # the manager would try to start a duplicate instance.
            test_client = redis.Redis(
                host=self.config.host,
                port=self.config.port,
                password=self.config.password,
                ssl=self.config.ssl,
                socket_connect_timeout=2,
                socket_timeout=2
            )
            test_client.ping()
            test_client.close()
            return True
        except Exception:
            return False

    def _try_systemctl_start(self) -> bool:
        """Try starting Redis with systemctl (Linux). Best-effort; never raises."""
        try:
            if not os.path.exists('/usr/bin/systemctl'):
                return False

            subprocess.run(['sudo', 'systemctl', 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)  # give the daemon a moment to accept connections
            return self._is_redis_running()
        except Exception:
            return False

    def _try_brew_start(self) -> bool:
        """Try starting Redis with brew services (macOS). Best-effort; never raises."""
        try:
            # NOTE(review): assumes Apple-Silicon Homebrew prefix; an
            # Intel-mac /usr/local/bin/brew install is skipped — confirm.
            if not os.path.exists('/opt/homebrew/bin/brew'):
                return False

            subprocess.run(['/opt/homebrew/bin/brew', 'services', 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)
            return self._is_redis_running()
        except Exception:
            return False

    def _try_direct_start(self) -> bool:
        """Try starting a redis-server binary directly as a daemon."""
        try:
            # Find redis-server binary: probe known locations, then PATH.
            redis_cmd = None
            for path in ['/opt/homebrew/bin/redis-server', '/usr/local/bin/redis-server', 'redis-server']:
                try:
                    subprocess.run([path, '--version'], capture_output=True, check=True, timeout=5)
                    redis_cmd = path
                    break
                except Exception:
                    continue

            if not redis_cmd:
                return False

            # Start Redis with appropriate config
            config_args = [
                redis_cmd,
                '--port', str(self.config.port),
                '--bind', self.config.host,
                '--protected-mode', 'no',
                '--loglevel', 'notice',
                '--daemonize', 'yes'  # Run as daemon
            ]

            # Add persistence settings matching this environment's policy.
            if not self.config.persist:
                config_args.extend(['--save', '', '--appendonly', 'no'])
            else:
                config_args.extend(['--save', '60 1000', '--appendonly', 'yes'])

            subprocess.run(config_args, check=True, capture_output=True, timeout=10)
            time.sleep(3)  # daemonized server needs a moment to bind
            return self._is_redis_running()

        except Exception as e:
            logger.debug(f"Direct Redis start failed: {e}")
            return False

    def _try_docker_start(self) -> bool:
        """Try starting Redis in Docker, reusing an existing container if present."""
        try:
            # Bail out early if the docker CLI is not available.
            subprocess.run(['docker', '--version'], capture_output=True, check=True, timeout=5)

            # Check if Redis container already exists
            result = subprocess.run(
                ['docker', 'ps', '-a', '--filter', 'name=classroomcopilot-redis', '--format', '{{.Names}}'],
                capture_output=True, text=True, timeout=10
            )

            if 'classroomcopilot-redis' in result.stdout:
                # Start existing container
                subprocess.run(['docker', 'start', 'classroomcopilot-redis'],
                               check=True, capture_output=True, timeout=10)
            else:
                # Create new container
                docker_cmd = [
                    'docker', 'run', '-d',
                    '--name', 'classroomcopilot-redis',
                    '-p', f'{self.config.port}:6379',
                    'redis:alpine'
                ]

                # Arguments after the image name override the container's
                # default command: run without persistence when not needed.
                if not self.config.persist:
                    docker_cmd.extend(['redis-server', '--save', '', '--appendonly', 'no'])

                subprocess.run(docker_cmd, check=True, capture_output=True, timeout=30)

            time.sleep(3)
            return self._is_redis_running()

        except Exception as e:
            logger.debug(f"Docker Redis start failed: {e}")
            return False

    def connect(self) -> bool:
        """Establish Redis connection with retry logic.

        Returns:
            True when a PING succeeds; False after all attempts fail.
        """
        max_attempts = int(os.getenv('REDIS_MAX_RETRY_ATTEMPTS', '3'))

        for attempt in range(1, max_attempts + 1):
            try:
                logger.info(f"🔌 Connecting to Redis... (attempt {attempt}/{max_attempts})")

                self.client = redis.Redis(
                    host=self.config.host,
                    port=self.config.port,
                    db=self.config.db,
                    password=self.config.password,
                    # BUGFIX: honor REDIS_SSL — the flag was loaded into
                    # config but never passed, so TLS was silently ignored.
                    ssl=self.config.ssl,
                    decode_responses=True,
                    socket_connect_timeout=5,
                    socket_timeout=5,
                    retry_on_timeout=True
                )

                # Test connection
                self.client.ping()

                logger.info(f"✅ Connected to Redis {self.config.host}:{self.config.port}/db{self.config.db}")
                return True

            except Exception as e:
                logger.warning(f"❌ Connection attempt {attempt} failed: {e}")
                if attempt < max_attempts:
                    logger.info("⏳ Retrying in 2 seconds...")
                    time.sleep(2)
                else:
                    logger.error("💥 All connection attempts failed")
                    return False

        return False  # defensive: only reachable if max_attempts < 1

    def initialize_environment(self) -> bool:
        """Start/connect to Redis, then initialize per the environment mode."""

        if not self.ensure_service_running():
            logger.error("Cannot initialize - Redis service not available")
            return False

        if not self.connect():
            logger.error("Cannot initialize - Connection failed")
            return False

        if self.environment == Environment.DEV:
            return self._initialize_dev_environment()
        elif self.environment == Environment.PROD:
            return self._initialize_prod_environment()
        else:
            return self._initialize_test_environment()

    def _initialize_dev_environment(self) -> bool:
        """Initialize development environment - clean slate, no persistence."""
        try:
            logger.info("🧹 DEV MODE: Clearing all data for clean startup...")

            # Get all keys in this database (only to report a count).
            all_keys = self.client.keys('*')

            if all_keys:
                # Nuclear option - clear everything in this DB
                self.client.flushdb()
                logger.info(f"💥 DEV MODE: Nuked {len(all_keys)} keys for clean startup")
            else:
                logger.info("✅ DEV MODE: Database already clean")

            # Set up development-specific config
            try:
                self.client.config_set('save', '')  # Disable RDB snapshots
                self.client.config_set('appendonly', 'no')  # Disable AOF
            except Exception as e:
                # CONFIG SET can be disabled/renamed on managed Redis.
                logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            logger.info("🎯 DEV MODE: Environment initialized for fast iteration")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize dev environment: {e}")
            return False

    def _initialize_prod_environment(self) -> bool:
        """Initialize production environment - preserve data, recover tasks."""
        try:
            logger.info("🏭 PROD MODE: Initializing with data preservation...")

            # Check for existing tasks and report
            task_keys = self.client.keys('task:*')
            queue_keys = self.client.keys('queue:*')
            processing_keys = self.client.keys('processing:*')

            logger.info(f"📊 PROD MODE: Found {len(task_keys)} tasks, {len(queue_keys)} queues, {len(processing_keys)} processing counters")

            # Enable persistence
            if self.config.persist:
                try:
                    self.client.config_set('save', '60 1000')  # Save every 60s if ≥1000 changes
                    self.client.config_set('appendonly', 'yes')  # Enable AOF
                    logger.info("💾 PROD MODE: Persistence enabled")
                except Exception as e:
                    logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            # Recover any stuck tasks
            self._recover_stuck_tasks()

            logger.info("✅ PROD MODE: Environment initialized with data recovery")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize prod environment: {e}")
            return False

    def _initialize_test_environment(self) -> bool:
        """Initialize test environment - isolated, clean, and fast."""
        try:
            logger.info("🧪 TEST MODE: Setting up isolated test environment...")

            # Clear test database
            self.client.flushdb()

            # Disable persistence for speed
            try:
                self.client.config_set('save', '')
                self.client.config_set('appendonly', 'no')
            except Exception as e:
                logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            logger.info("✅ TEST MODE: Clean, isolated environment ready")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize test environment: {e}")
            return False

    def _recover_stuck_tasks(self):
        """Recover tasks that were processing when system shut down.

        Re-queues each entry found in the 'processing' hash: resets the
        task's status to pending, pushes it back on its priority queue,
        and decrements the per-service processing counter.
        """
        try:
            logger.info("🔄 Recovering stuck tasks from previous session...")

            # Get all processing tasks
            processing_data = self.client.hgetall('processing')

            if not processing_data:
                logger.info("✅ No stuck tasks to recover")
                return

            recovered_count = 0

            for task_id, task_info in processing_data.items():
                try:
                    # Parse task info (stored as a JSON blob per task).
                    info = json.loads(task_info)

                    # Check if task still exists
                    task_key = f"task:{task_id}"
                    if not self.client.exists(task_key):
                        logger.warning(f"⚠️ Task {task_id} data missing, removing from processing")
                        self.client.hdel('processing', task_id)
                        continue

                    # Reset task to pending and re-queue
                    self.client.hset(task_key, 'status', 'pending')
                    self.client.hdel(task_key, 'started_at')

                    # Add back to appropriate queue
                    priority = info.get('priority', 'normal')
                    queue_key = f"queue:{priority}"
                    self.client.lpush(queue_key, task_id)

                    # Remove from processing
                    self.client.hdel('processing', task_id)

                    # Reset service counters (guarded so we never go negative).
                    service = info.get('service')
                    if service:
                        service_key = f"processing:{service}"
                        current_count = int(self.client.get(service_key) or 0)
                        if current_count > 0:
                            self.client.decr(service_key)

                    recovered_count += 1
                    logger.info(f"🔄 Recovered stuck task {task_id} ({service}/{priority})")

                except Exception as e:
                    logger.error(f"Failed to recover task {task_id}: {e}")
                    # Remove problematic entry so it can't wedge every startup.
                    self.client.hdel('processing', task_id)

            logger.info(f"✅ Recovered {recovered_count} stuck tasks")

        except Exception as e:
            logger.error(f"Task recovery failed: {e}")

    def health_check(self) -> Dict[str, Any]:
        """Comprehensive health check.

        Returns:
            Dict with 'status' ('healthy'/'unhealthy'), connection flag,
            memory usage, queue statistics, and 'error' when unhealthy.
        """
        health = {
            'status': 'healthy',
            'environment': self.environment.value,
            'database': self.config.db,
            'connection': False,
            'memory_usage': None,
            'queue_stats': {},
            'error': None
        }

        try:
            if not self.client:
                raise Exception("No Redis connection")

            # Test connection
            self.client.ping()
            health['connection'] = True

            # Get memory usage
            info = self.client.info('memory')
            health['memory_usage'] = {
                'used_memory_human': info.get('used_memory_human', 'unknown'),
                'used_memory_peak_human': info.get('used_memory_peak_human', 'unknown')
            }

            # Get queue statistics (KEYS is O(n); acceptable for a health probe).
            health['queue_stats'] = {
                'total_keys': len(self.client.keys('*')),
                'tasks': len(self.client.keys('task:*')),
                'queues': {
                    'high': self.client.llen('queue:high'),
                    'normal': self.client.llen('queue:normal'),
                    'low': self.client.llen('queue:low')
                },
                'processing': self.client.hlen('processing'),
                'dead_letter': self.client.llen('dead_letter')
            }

        except Exception as e:
            health['status'] = 'unhealthy'
            health['error'] = str(e)

        return health

    def shutdown(self, force: bool = False):
        """Graceful shutdown with optional data preservation.

        Args:
            force: In DEV mode, skip the flushdb so data survives;
                has no effect in other environments.
        """

        if self.environment == Environment.DEV and not force:
            logger.info("🧹 DEV MODE: Clearing data on shutdown...")
            try:
                if self.client:
                    self.client.flushdb()
                    logger.info("✅ Dev data cleared")
            except Exception as e:
                logger.warning(f"Failed to clear dev data: {e}")

        # Close connection
        if self.client:
            try:
                self.client.close()
                logger.info("🔌 Redis connection closed")
            except Exception as e:
                logger.warning(f"Error closing Redis connection: {e}")

        # Stop subprocess if we started it
        if self._subprocess:
            try:
                self._subprocess.terminate()
                self._subprocess.wait(timeout=5)
                logger.info("🛑 Redis subprocess stopped")
            except Exception as e:
                logger.warning(f"Error stopping Redis subprocess: {e}")

        logger.info(f"✅ Redis manager shutdown complete ({self.environment.value})")
|
|
|
|
# Convenience functions for backward compatibility
|
|
def get_redis_manager(environment: str = "dev") -> RedisManager:
    """Get a Redis manager instance for the specified environment."""
    return RedisManager(Environment(environment.lower()))
|
|
|
|
def ensure_redis_running(environment: str = "dev") -> bool:
    """Ensure Redis is running for the specified environment."""
    return get_redis_manager(environment).ensure_service_running()
|