# api/modules/redis_manager.py

"""
Comprehensive Redis Management System
=====================================

Handles environment-specific Redis configuration, service management,
task recovery, and health monitoring for ClassroomCopilot.

Features:
- Environment isolation (dev/prod/test databases)
- Automatic service management and health checks
- Task recovery and persistence strategies
- Graceful degradation and error handling
"""

import os
import redis
import subprocess
import time
import signal
import json
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)

class Environment(Enum):
    """Deployment environments that RedisManager can be configured for.

    The member values are the lowercase strings accepted by
    ``Environment(<str>)`` (see ``get_redis_manager``).
    """

    DEV = "dev"    # development: database is flushed on initialization
    PROD = "prod"  # production: existing data is kept and stuck tasks recovered
    TEST = "test"  # test: database is flushed, persistence always disabled

@dataclass
class RedisConfig:
    """Resolved Redis settings for one environment (built by ``_load_config``)."""

    host: str                # from REDIS_HOST
    port: int                # from REDIS_PORT
    db: int                  # per-environment database index
    password: Optional[str]  # from REDIS_PASSWORD; None when unset or empty
    ssl: bool                # True when REDIS_SSL is "true" (case-insensitive)
    persist: bool            # whether RDB/AOF persistence should be enabled
    task_ttl: int            # per-environment task TTL (presumably seconds — TODO confirm)
    url: str                 # connection URL assembled from the fields above

class RedisManager:
    """Comprehensive Redis management with environment isolation and recovery."""

    def __init__(self, environment: Environment = Environment.DEV):
        """Create a manager bound to ``environment``.

        Only configuration is loaded here; no connection is opened until
        ``connect()`` (or ``initialize_environment()``) is called.

        Args:
            environment: Which environment's settings to load.
        """
        self.environment = environment
        self.config = self._load_config()

        # Populated later by connect() / the service launchers.
        self.client: Optional[redis.Redis] = None
        self._subprocess: Optional[subprocess.Popen] = None
        self._health_check_enabled = True

        cfg = self.config
        logger.info(f"🔧 Redis Manager initialized for {environment.value} environment")
        logger.info(f"📡 Target: {cfg.host}:{cfg.port}/db{cfg.db}")

    def _load_config(self) -> RedisConfig:
        """Load environment-specific Redis configuration.

        Connection settings (``REDIS_HOST``, ``REDIS_PORT``,
        ``REDIS_PASSWORD``, ``REDIS_SSL``) are shared by all environments.
        The database index, persistence flag and task TTL come from
        per-environment variables with built-in defaults.

        Returns:
            RedisConfig: the resolved settings, including a connection URL.

        Raises:
            ValueError: if a numeric environment variable is not an integer.
        """
        # Local import: URL quoting is only needed here.
        from urllib.parse import quote

        # Base configuration
        host = os.getenv('REDIS_HOST', 'localhost')
        port = int(os.getenv('REDIS_PORT', '6379'))
        password = os.getenv('REDIS_PASSWORD') or None  # empty string -> None
        ssl = os.getenv('REDIS_SSL', 'false').lower() == 'true'

        # Environment-specific settings
        if self.environment == Environment.DEV:
            db = int(os.getenv('REDIS_DB_DEV', '0'))
            persist = os.getenv('REDIS_PERSIST_DEV', 'false').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_DEV', '3600'))
        elif self.environment == Environment.PROD:
            db = int(os.getenv('REDIS_DB_PROD', '1'))
            persist = os.getenv('REDIS_PERSIST_PROD', 'true').lower() == 'true'
            task_ttl = int(os.getenv('REDIS_TASK_TTL_PROD', '86400'))
        else:  # TEST
            db = int(os.getenv('REDIS_DB_TEST', '2'))
            persist = False  # tests never persist, regardless of env vars
            task_ttl = int(os.getenv('REDIS_TASK_TTL_TEST', '1800'))

        # Construct URL.
        # - "rediss" is the TLS scheme; using "redis" with ssl=True would make
        #   URL-based clients connect in plaintext.
        # - The password is percent-encoded so characters such as "@", ":" or
        #   "/" cannot corrupt the URL structure.
        scheme = "rediss" if ssl else "redis"
        auth_part = f":{quote(password, safe='')}@" if password else ""
        url = f"{scheme}://{auth_part}{host}:{port}/{db}"

        return RedisConfig(
            host=host,
            port=port,
            db=db,
            password=password,
            ssl=ssl,
            persist=persist,
            task_ttl=task_ttl,
            url=url,
        )

    def ensure_service_running(self) -> bool:
        """Make sure a Redis server is reachable, launching one if necessary.

        Returns:
            bool: True if Redis was already up or one of the launchers
            brought it up; False if every launcher failed.
        """
        if self._is_redis_running():
            logger.info("✅ Redis service already running")
            return True

        logger.info("🚀 Starting Redis service...")

        # Tried strictly in this order; the first launcher that succeeds wins.
        launchers = (
            self._try_systemctl_start,  # Linux production
            self._try_brew_start,       # macOS
            self._try_direct_start,     # redis-server binary
            self._try_docker_start,     # Docker fallback
        )
        for launch in launchers:
            if launch():
                return True

        logger.error("❌ Failed to start Redis service with all methods")
        return False

    def _is_redis_running(self) -> bool:
        """Check whether a Redis server answers at the configured host/port.

        The probe uses the configured password and TLS setting so that a
        secured server is not mistaken for a stopped one. A server that
        replies with an authentication or other error reply is still counted
        as running: it is up, and starting a second instance would only
        collide with it. Credential problems are left for ``connect()`` to
        report.

        Returns:
            bool: True if a server responded, False if it could not be reached.
        """
        probe = None
        try:
            probe = redis.Redis(
                host=self.config.host,
                port=self.config.port,
                password=self.config.password,
                ssl=self.config.ssl,
                socket_connect_timeout=2,
                socket_timeout=2,
            )
            probe.ping()
            return True
        except (redis.exceptions.AuthenticationError, redis.exceptions.ResponseError):
            # AuthenticationError subclasses ConnectionError, so it must be
            # caught before the generic handler below.
            return True
        except Exception:
            return False
        finally:
            # Always release the probe's connection, even when ping() raised.
            if probe is not None:
                try:
                    probe.close()
                except Exception:
                    # Best-effort cleanup; must not override the result above.
                    pass

    def _try_systemctl_start(self) -> bool:
        """Try starting Redis with systemctl.

        ``systemctl`` is located on PATH rather than at a fixed location, so
        hosts that ship it as ``/bin/systemctl`` are supported too.

        Returns:
            bool: True if Redis answers after the start command; False if
            systemctl is unavailable, the command fails, or Redis stays down.
        """
        # Local import: only the service launchers need it.
        import shutil

        try:
            systemctl = shutil.which('systemctl')
            if systemctl is None:
                return False

            subprocess.run(['sudo', systemctl, 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)  # give the unit a moment to open its socket
            return self._is_redis_running()
        except Exception as e:
            # Logged like the direct/Docker launchers so failures are traceable.
            logger.debug(f"systemctl Redis start failed: {e}")
            return False

    def _try_brew_start(self) -> bool:
        """Try starting Redis with brew services.

        Both Homebrew prefixes are checked: ``/opt/homebrew`` (Apple Silicon)
        and ``/usr/local`` (Intel), matching the prefixes probed by
        ``_try_direct_start``.

        Returns:
            bool: True if Redis answers after the start command; False if
            brew is not installed, the command fails, or Redis stays down.
        """
        try:
            brew_candidates = ('/opt/homebrew/bin/brew', '/usr/local/bin/brew')
            brew = next((path for path in brew_candidates if os.path.exists(path)), None)
            if brew is None:
                return False

            subprocess.run([brew, 'services', 'start', 'redis'],
                           check=True, capture_output=True, timeout=10)
            time.sleep(2)  # give the service a moment to open its socket
            return self._is_redis_running()
        except Exception as e:
            # Logged like the direct/Docker launchers so failures are traceable.
            logger.debug(f"brew Redis start failed: {e}")
            return False

    def _try_direct_start(self) -> bool:
        """Try starting Redis server directly.

        Probes a few known ``redis-server`` locations, then launches the first
        working binary as a daemon on the configured host and port, with
        persistence switched on or off according to ``config.persist``.

        Returns:
            bool: True if Redis answers after the launch; False if no binary
            was found, the launch failed, or Redis stays down.
        """
        try:
            # Find redis-server binary: the first candidate that can print
            # its version is used.
            redis_cmd = None
            for path in ['/opt/homebrew/bin/redis-server', '/usr/local/bin/redis-server', 'redis-server']:
                try:
                    subprocess.run([path, '--version'], capture_output=True, check=True, timeout=5)
                    redis_cmd = path
                    break
                except Exception:
                    continue

            if not redis_cmd:
                return False

            # Start Redis with appropriate config.
            # NOTE(review): protected mode is disabled and no password is set
            # on the spawned server — confirm this is only used on trusted hosts.
            config_args = [
                redis_cmd,
                '--port', str(self.config.port),
                '--bind', self.config.host,
                '--protected-mode', 'no',
                '--loglevel', 'notice',
                '--daemonize', 'yes'  # Run as daemon
            ]

            # Add persistence settings
            if not self.config.persist:
                config_args.extend(['--save', '', '--appendonly', 'no'])
            else:
                # The save point is passed as two separate arguments
                # ("--save 60 1000"): that form is accepted by every Redis
                # version, whereas a single "60 1000" argument may not be.
                config_args.extend(['--save', '60', '1000', '--appendonly', 'yes'])

            subprocess.run(config_args, check=True, capture_output=True, timeout=10)
            time.sleep(3)  # give the daemon a moment to open its socket
            return self._is_redis_running()

        except Exception as e:
            logger.debug(f"Direct Redis start failed: {e}")
            return False

    def _try_docker_start(self) -> bool:
        """Try starting Redis with Docker.

        Re-starts the ``classroomcopilot-redis`` container if it already
        exists, otherwise creates it from ``redis:alpine`` with the configured
        port published and persistence set according to ``config.persist``.

        Returns:
            bool: True if Redis answers afterwards; False if Docker is
            unavailable, a command fails, or Redis stays down.
        """
        container_name = 'classroomcopilot-redis'
        try:
            subprocess.run(['docker', '--version'], capture_output=True, check=True, timeout=5)

            # Check if Redis container already exists
            result = subprocess.run(
                ['docker', 'ps', '-a', '--filter', f'name={container_name}', '--format', '{{.Names}}'],
                capture_output=True, text=True, timeout=10
            )

            # Docker's name filter is a substring match, so compare whole
            # names: "classroomcopilot-redis-2" must not count as a hit.
            if container_name in result.stdout.split():
                # Start existing container
                subprocess.run(['docker', 'start', container_name],
                               check=True, capture_output=True, timeout=10)
            else:
                # Create new container
                docker_cmd = [
                    'docker', 'run', '-d',
                    '--name', container_name,
                    '-p', f'{self.config.port}:6379',
                    'redis:alpine'
                ]

                # Same persistence policy as the direct redis-server launch.
                if not self.config.persist:
                    docker_cmd.extend(['redis-server', '--save', '', '--appendonly', 'no'])
                else:
                    docker_cmd.extend(['redis-server', '--save', '60', '1000', '--appendonly', 'yes'])

                subprocess.run(docker_cmd, check=True, capture_output=True, timeout=30)

            time.sleep(3)  # give the container a moment to accept connections
            return self._is_redis_running()

        except Exception as e:
            logger.debug(f"Docker Redis start failed: {e}")
            return False

    def connect(self) -> bool:
        """Establish Redis connection with retry logic.

        The number of attempts comes from ``REDIS_MAX_RETRY_ATTEMPTS``
        (default 3, never fewer than one), with a 2-second pause between
        attempts. On success ``self.client`` holds the connected client.

        Returns:
            bool: True once a ping succeeds, False if every attempt failed.

        Raises:
            ValueError: if ``REDIS_MAX_RETRY_ATTEMPTS`` is not an integer.
        """
        # Clamp to at least one attempt so a zero/negative setting cannot make
        # connect() fail silently without ever trying.
        max_attempts = max(1, int(os.getenv('REDIS_MAX_RETRY_ATTEMPTS', '3')))

        for attempt in range(1, max_attempts + 1):
            try:
                logger.info(f"🔌 Connecting to Redis... (attempt {attempt}/{max_attempts})")

                self.client = redis.Redis(
                    host=self.config.host,
                    port=self.config.port,
                    db=self.config.db,
                    password=self.config.password,
                    ssl=self.config.ssl,  # honour REDIS_SSL instead of ignoring it
                    decode_responses=True,
                    socket_connect_timeout=5,
                    socket_timeout=5,
                    retry_on_timeout=True
                )

                # Test connection
                self.client.ping()

                logger.info(f"✅ Connected to Redis {self.config.host}:{self.config.port}/db{self.config.db}")
                return True

            except Exception as e:
                logger.warning(f"❌ Connection attempt {attempt} failed: {e}")
                if attempt < max_attempts:
                    logger.info("⏳ Retrying in 2 seconds...")
                    time.sleep(2)

        logger.error("💥 All connection attempts failed")
        return False

    def initialize_environment(self) -> bool:
        """Bring Redis up, connect, and run the setup for the active environment.

        Returns:
            bool: False if the service cannot be started or the connection
            fails; otherwise the result of the environment-specific setup.
        """
        if not self.ensure_service_running():
            logger.error("Cannot initialize - Redis service not available")
            return False

        if not self.connect():
            logger.error("Cannot initialize - Connection failed")
            return False

        # Anything that is neither DEV nor PROD gets the test setup.
        setup_by_environment = (
            (Environment.DEV, self._initialize_dev_environment),
            (Environment.PROD, self._initialize_prod_environment),
        )
        for candidate, setup in setup_by_environment:
            if self.environment == candidate:
                return setup()
        return self._initialize_test_environment()

    def _initialize_dev_environment(self) -> bool:
        """Initialize development environment - clean slate.

        Flushes the selected database if it holds any keys, then tries to
        turn off RDB snapshots and AOF.

        Returns:
            bool: True on success, False if flushing or counting keys failed
            (the error is logged). Failure to change the Redis config is
            tolerated and only logged at debug level.
        """
        try:
            logger.info("🧹 DEV MODE: Clearing all data for clean startup...")

            # DBSIZE is O(1); KEYS * would walk (and block on) the whole
            # keyspace just to produce a count.
            key_count = self.client.dbsize()

            if key_count:
                # Nuclear option - clear everything in this DB
                self.client.flushdb()
                logger.info(f"💥 DEV MODE: Nuked {key_count} keys for clean startup")
            else:
                logger.info("✅ DEV MODE: Database already clean")

            # Set up development-specific config.
            # NOTE(review): CONFIG SET applies to the whole server, not just
            # this database — confirm dev never shares a server with prod.
            try:
                self.client.config_set('save', '')  # Disable RDB snapshots
                self.client.config_set('appendonly', 'no')  # Disable AOF
            except Exception as e:
                logger.debug(f"Could not set Redis config (may not have permissions): {e}")

            logger.info("🎯 DEV MODE: Environment initialized for fast iteration")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize dev environment: {e}")
            return False

    def _initialize_prod_environment(self) -> bool:
        """Initialize production environment - preserve data.

        Reports how many task, queue and processing keys exist, enables
        persistence when ``config.persist`` is set, and re-queues tasks that
        were still marked as processing.

        Returns:
            bool: True on success, False if an unexpected error occurred
            (the error is logged).
        """
        try:
            logger.info("🏭 PROD MODE: Initializing with data preservation...")

            def count_keys(pattern: str) -> int:
                # SCAN iterates incrementally instead of blocking the server
                # like KEYS does; it may yield a key more than once, hence
                # the set.
                return len(set(self.client.scan_iter(match=pattern, count=1000)))

            # Check for existing tasks and report
            task_count = count_keys('task:*')
            queue_count = count_keys('queue:*')
            processing_count = count_keys('processing:*')

            logger.info(f"📊 PROD MODE: Found {task_count} tasks, {queue_count} queues, {processing_count} processing counters")

            # Enable persistence
            if self.config.persist:
                try:
                    self.client.config_set('save', '60 1000')  # Save every 60s if ≥1000 changes
                    self.client.config_set('appendonly', 'yes')  # Enable AOF
                    logger.info("💾 PROD MODE: Persistence enabled")
                except Exception as e:
                    # Warning, not debug: production running without the
                    # requested persistence must be visible in the logs.
                    logger.warning(f"Could not set Redis config (may not have permissions): {e}")

            # Recover any stuck tasks
            self._recover_stuck_tasks()

            logger.info("✅ PROD MODE: Environment initialized with data recovery")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize prod environment: {e}")
            return False

    def _initialize_test_environment(self) -> bool:
        """Prepare a clean test database with persistence switched off.

        Returns:
            bool: True on success, False if flushing the database failed
            (the error is logged). Failure to change the Redis config is
            tolerated and only logged at debug level.
        """
        # Applied in order; the first failure skips the remaining settings.
        no_persistence = (('save', ''), ('appendonly', 'no'))

        try:
            logger.info("🧪 TEST MODE: Setting up isolated test environment...")

            # Start every test run from an empty database.
            self.client.flushdb()

            # Persistence only slows tests down.
            try:
                for option, value in no_persistence:
                    self.client.config_set(option, value)
            except Exception as config_error:
                logger.debug(f"Could not set Redis config (may not have permissions): {config_error}")

            logger.info("✅ TEST MODE: Clean, isolated environment ready")
            return True

        except Exception as init_error:
            logger.error(f"Failed to initialize test environment: {init_error}")
            return False

    def _recover_stuck_tasks(self):
        """Recover tasks that were processing when system shut down.

        Every entry of the ``processing`` hash is examined:

        * if its ``task:<id>`` key no longer exists, the entry is dropped;
        * otherwise the task is reset to ``pending``, pushed back onto
          ``queue:<priority>`` and removed from ``processing``, and the
          ``processing:<service>`` counter is decremented if positive;
        * entries that cannot be processed (for example invalid JSON) are
          logged and removed.

        Errors are logged and never raised. A failure on one task does not
        stop recovery of the others.
        """
        try:
            logger.info("🔄 Recovering stuck tasks from previous session...")

            # Get all processing tasks
            processing_data = self.client.hgetall('processing')

            if not processing_data:
                logger.info("✅ No stuck tasks to recover")
                return

            recovered_count = 0

            for task_id, task_info in processing_data.items():
                try:
                    # Parse task info; reject non-object payloads up front so
                    # nothing is mutated for an entry we cannot interpret.
                    info = json.loads(task_info)
                    if not isinstance(info, dict):
                        raise ValueError(
                            f"processing entry is not a JSON object ({type(info).__name__})"
                        )

                    # Check if task still exists
                    task_key = f"task:{task_id}"
                    if not self.client.exists(task_key):
                        logger.warning(f"⚠️  Task {task_id} data missing, removing from processing")
                        self.client.hdel('processing', task_id)
                        continue

                    priority = info.get('priority', 'normal')
                    service = info.get('service')
                    queue_key = f"queue:{priority}"

                    # Reset, re-queue and un-mark in one MULTI/EXEC so a crash
                    # in between cannot leave the task both queued and still
                    # listed as processing (which would re-queue it twice).
                    with self.client.pipeline(transaction=True) as pipe:
                        pipe.hset(task_key, 'status', 'pending')
                        pipe.hdel(task_key, 'started_at')
                        pipe.lpush(queue_key, task_id)
                        pipe.hdel('processing', task_id)
                        pipe.execute()

                    # Reset service counters (never below zero)
                    if service:
                        service_key = f"processing:{service}"
                        current_count = int(self.client.get(service_key) or 0)
                        if current_count > 0:
                            self.client.decr(service_key)

                    recovered_count += 1
                    logger.info(f"🔄 Recovered stuck task {task_id} ({service}/{priority})")

                except Exception as e:
                    logger.error(f"Failed to recover task {task_id}: {e}")
                    # Remove problematic entry. Guarded: if this cleanup fails
                    # too, it must not abort recovery of the remaining tasks.
                    try:
                        self.client.hdel('processing', task_id)
                    except Exception as cleanup_error:
                        logger.error(
                            f"Could not remove processing entry for task {task_id}: {cleanup_error}"
                        )

            logger.info(f"✅ Recovered {recovered_count} stuck tasks")

        except Exception as e:
            logger.error(f"Task recovery failed: {e}")

    def health_check(self) -> Dict[str, Any]:
        """Comprehensive health check."""
        health = {
            'status': 'healthy',
            'environment': self.environment.value,
            'database': self.config.db,
            'connection': False,
            'memory_usage': None,
            'queue_stats': {},
            'error': None
        }

        try:
            if not self.client:
                raise Exception("No Redis connection")

            # Test connection
            self.client.ping()
            health['connection'] = True

            # Get memory usage
            info = self.client.info('memory')
            health['memory_usage'] = {
                'used_memory_human': info.get('used_memory_human', 'unknown'),
                'used_memory_peak_human': info.get('used_memory_peak_human', 'unknown')
            }

            # Get queue statistics
            health['queue_stats'] = {
                'total_keys': len(self.client.keys('*')),
                'tasks': len(self.client.keys('task:*')),
                'queues': {
                    'high': self.client.llen('queue:high'),
                    'normal': self.client.llen('queue:normal'),
                    'low': self.client.llen('queue:low')
                },
                'processing': self.client.hlen('processing'),
                'dead_letter': self.client.llen('dead_letter')
            }

        except Exception as e:
            health['status'] = 'unhealthy'
            health['error'] = str(e)

        return health

    def shutdown(self, force: bool = False):
        """Graceful shutdown with optional data preservation.

        In the DEV environment the database is flushed before disconnecting
        unless ``force`` is True. The client connection is then closed and,
        if this manager holds a Redis subprocess, that process is stopped.
        Errors are logged as warnings and never raised. Afterwards the
        manager holds no client or subprocess reference, so calling
        ``shutdown()`` again is harmless.

        Args:
            force: When True, skip the DEV-mode data flush.
        """
        if self.environment == Environment.DEV and not force:
            logger.info("🧹 DEV MODE: Clearing data on shutdown...")
            try:
                if self.client:
                    self.client.flushdb()
                    logger.info("✅ Dev data cleared")
            except Exception as e:
                logger.warning(f"Failed to clear dev data: {e}")

        # Close connection
        if self.client:
            try:
                self.client.close()
                logger.info("🔌 Redis connection closed")
            except Exception as e:
                logger.warning(f"Error closing Redis connection: {e}")
            finally:
                # Drop the reference so a closed client is never reused.
                self.client = None

        # Stop subprocess if we started it
        if self._subprocess:
            try:
                self._subprocess.terminate()
                try:
                    self._subprocess.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    # The process ignored SIGTERM: kill it rather than leave
                    # an orphaned server behind, then reap it.
                    self._subprocess.kill()
                    self._subprocess.wait()
                logger.info("🛑 Redis subprocess stopped")
            except Exception as e:
                logger.warning(f"Error stopping Redis subprocess: {e}")
            finally:
                self._subprocess = None

        logger.info(f"✅ Redis manager shutdown complete ({self.environment.value})")

# Convenience functions for backward compatibility
def get_redis_manager(environment: str = "dev") -> RedisManager:
    """Build a RedisManager for the named environment.

    Args:
        environment: Environment name; matched case-insensitively against
            the ``Environment`` values.

    Raises:
        ValueError: if the name does not match any ``Environment`` value.
    """
    return RedisManager(Environment(environment.lower()))

def ensure_redis_running(environment: str = "dev") -> bool:
    """Make sure Redis is up for the named environment.

    Creates a throwaway manager for ``environment`` and asks it to start the
    service if it is not already reachable.

    Returns:
        bool: True if Redis is running afterwards, False otherwise.
    """
    return get_redis_manager(environment).ensure_service_running()