# api/run/initialization/gais_data.py

"""
GAIS Data Import Module for ClassroomCopilot
Handles import of publicly available school databases into Neo4j
Starting with Edubase All Data
"""
import os
import csv
import time
from typing import Dict, Any, List, Optional, Set, Tuple
from datetime import datetime
from modules.logger_tool import initialise_logger
from modules.database.services.neo4j_service import Neo4jService
from modules.database.tools.neo4j_session_tools import create_relationship

# Module-level logger shared by every function and method in this file.
# The LOG_LEVEL and LOG_PATH environment variables are passed straight through
# to initialise_logger (either may be None when unset).
# NOTE(review): the meaning of the trailing 'default' and True arguments is
# defined by modules.logger_tool.initialise_logger — confirm there.
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)

def create_node_direct(session, label, properties):
    """Create a single node with the given label and return it.

    The node is read back from the ``RETURN n`` column of the CREATE query
    itself, so no follow-up lookup is needed.

    Args:
        session: Object exposing ``run(query, **params)`` whose result has a
            ``single()`` method.
        label: Node label, interpolated verbatim into the Cypher text.
        properties: Mapping passed as the ``$properties`` query parameter.

    Returns:
        The value of the ``n`` column, or ``None`` when no record came back
        or when any exception was raised (the exception is logged).
    """
    try:
        cypher = f"""
        CREATE (n:{label} $properties)
        RETURN n
        """
        row = session.run(cypher, properties=properties).single()
        return row["n"] if row else None
    except Exception as e:
        # Best-effort helper: log and signal failure with None.
        logger.error(f"Error creating {label} node: {str(e)}")
        return None

class GAISDataImporter:
    """Handles import of publicly available school databases into Neo4j"""

    def __init__(self):
        """Set up the Neo4j service, import paths and de-duplication state.

        Also makes sure the target database exists (see
        ``_ensure_database_exists``), so constructing an importer talks to
        Neo4j.
        """
        self.neo4j_service = Neo4jService()
        # GAIS data lives in its own database; the name is overridable via env.
        self.db_name = os.getenv("NEO4J_GAIS_DATA", "gaisdata")  # Use dedicated database for GAIS data
        module_dir = os.path.dirname(__file__)
        self.import_dir = os.path.join(module_dir, "import")

        # One set of already-seen keys per lookup label, so that each lookup
        # node is only queued for creation once.
        tracked_labels = (
            'LocalAuthority',
            'EstablishmentType',
            'EstablishmentTypeGroup',
            'EstablishmentStatus',
            'PhaseOfEducation',
            'BoarderType',
            'GenderType',
            'ReligiousCharacter',
            'Diocese',
            'AdmissionsPolicy',
            'SpecialClasses',
            'TrustSchoolFlag',
            'FederationFlag',
            'Country',
            'County',
            'Town',
            'Locality',
            'GovernmentOfficeRegion',
            'DistrictAdministrative',
            'AdministrativeWard',
            'ParliamentaryConstituency',
            'UrbanRural',
            'Inspectorate',
            'QAB',
            'FurtherEducationType',
            'SixthForm',
        )
        self.created_nodes = {label: set() for label in tracked_labels}

        # Relationships already emitted.
        self.created_relationships = set()

        # Batch-processing state.
        self.batch_size = 1000
        self.current_batch = []

        # The database must exist before any import method opens a session on it.
        self._ensure_database_exists()

    def _ensure_database_exists(self) -> None:
        """Ensure the GAIS database exists, creating it when it is missing.

        The configured name comes from an environment variable, so it is
        handled defensively:

        * the existence check is case-insensitive, because Neo4j normalises
          database names to lower case (a mixed-case ``NEO4J_GAIS_DATA`` would
          otherwise never match and every start-up would attempt a CREATE);
        * the name is backtick-quoted in the CREATE statement so that names
          containing dots or dashes are accepted and the value cannot break
          out of the identifier position.

        Raises:
            Exception: Any error raised while listing or creating databases is
                logged and re-raised unchanged.
        """
        try:
            with self.neo4j_service.driver.session() as session:
                existing = {
                    str(record["name"]).lower()
                    for record in session.run("SHOW DATABASES")
                }

                if self.db_name.lower() in existing:
                    logger.info(f"Database '{self.db_name}' already exists")
                    return

                logger.info(f"Creating database '{self.db_name}' for GAIS data...")
                # Escape embedded backticks by doubling them, then quote.
                quoted_name = "`" + self.db_name.replace("`", "``") + "`"
                # consume() forces the statement to finish, so a server-side
                # failure surfaces here rather than after the success log.
                session.run(f"CREATE DATABASE {quoted_name}").consume()
                logger.info(f"Database '{self.db_name}' created successfully")

        except Exception as e:
            logger.error(f"Error ensuring database exists: {str(e)}")
            raise

    def import_edubase_data(self, test_mode=False) -> Dict[str, Any]:
        """Import Edubase All Data into Neo4j using the collect-then-create path.

        The CSV is read once to collect every node and unique relationship,
        then all nodes are created, then all relationships.

        Args:
            test_mode: When true, stop collecting after the first 100 rows.

        Returns:
            A result dict. On failure: ``success=False`` and ``message``.
            On success: ``success``, ``message``, ``total_rows``,
            ``processing_time`` (seconds), ``nodes_created`` (size of each
            de-duplication set in ``self.created_nodes``) and
            ``relationships_created`` (whatever ``_create_all_relationships``
            returned).
        """
        logger.info("Starting Edubase data import...")

        edubase_file = os.path.join(self.import_dir, "edubasealldata20250828.csv")

        if not os.path.exists(edubase_file):
            return {
                "success": False,
                "message": f"Edubase file not found: {edubase_file}"
            }

        try:
            start_time = time.time()

            # Try different encodings to handle potential encoding issues
            encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
            working_encoding = None

            for encoding in encodings_to_try:
                try:
                    logger.info(f"Trying to read CSV with {encoding} encoding...")
                    with open(edubase_file, 'r', encoding=encoding, newline='') as file:
                        # Header plus first data row must parse as CSV.
                        next(csv.DictReader(file))
                        # Decode the REST of the file too: a file whose first
                        # rows are plain ASCII would otherwise be accepted as
                        # utf-8 and then blow up mid-import on the first
                        # non-UTF-8 byte.
                        for _ in file:
                            pass
                    logger.info(f"Successfully read CSV with {encoding} encoding")
                    working_encoding = encoding
                    break
                except UnicodeDecodeError as e:
                    logger.warning(f"Failed to read with {encoding} encoding: {str(e)}")
                    continue
                except StopIteration:
                    # No data row at all: this is not an encoding problem, so
                    # report it as what it is instead of trying other codecs.
                    logger.warning(f"Edubase file contains no data rows: {edubase_file}")
                    return {
                        "success": False,
                        "message": f"Edubase file contains no data rows: {edubase_file}"
                    }
                except Exception as e:
                    logger.warning(f"Unexpected error with {encoding} encoding: {str(e)}")
                    continue

            if working_encoding is None:
                return {
                    "success": False,
                    "message": "Failed to read CSV file with any supported encoding"
                }

            # Now read the file with the working encoding and collect all data
            all_nodes = []  # List of (label, properties) tuples
            all_relationships = []  # List of (rel_type, start_key, end_key) tuples

            # Track unique relationships to avoid duplicates during collection
            unique_relationships = set()

            # newline='' is what the csv module asks for so that newlines
            # embedded in quoted fields are handled by the parser itself.
            with open(edubase_file, 'r', encoding=working_encoding, newline='') as file:
                csv_reader = csv.DictReader(file)

                # Process headers and create schema
                self._process_headers(csv_reader.fieldnames)

                # Process data rows and collect all nodes and relationships
                total_rows = 0
                for row in csv_reader:
                    nodes, relationships = self._process_edubase_row(row)
                    all_nodes.extend(nodes)

                    # Only add relationships that haven't been seen before
                    for rel in relationships:
                        if rel not in unique_relationships:
                            all_relationships.append(rel)
                            unique_relationships.add(rel)

                    total_rows += 1

                    # In test mode, only process first 100 rows
                    if test_mode and total_rows >= 100:
                        break

                    if total_rows % 1000 == 0:
                        logger.info(f"Collected data from {total_rows} rows...")

            logger.info(f"Collected {len(all_nodes)} nodes and {len(all_relationships)} unique relationships from {total_rows} rows")

            # Nodes first, so that every relationship endpoint exists.
            logger.info("Creating all nodes...")
            node_map = self._create_all_nodes(all_nodes)

            # Then create all relationships
            logger.info("Creating all relationships...")
            relationships_created = self._create_all_relationships(all_relationships, node_map)

        except Exception as e:
            logger.error(f"Error importing Edubase data: {str(e)}")
            return {
                "success": False,
                "message": f"Error importing Edubase data: {str(e)}"
            }

        processing_time = time.time() - start_time

        logger.info("Edubase data import completed successfully!")
        logger.info(f"Total rows processed: {total_rows}")
        logger.info(f"Processing time: {processing_time:.2f} seconds")

        return {
            "success": True,
            "message": f"Successfully imported {total_rows} Edubase records",
            "total_rows": total_rows,
            "processing_time": processing_time,
            "nodes_created": {k: len(v) for k, v in self.created_nodes.items()},
            "relationships_created": relationships_created
        }

    def import_edubase_data_simple(self, test_mode=False) -> Dict[str, Any]:
        """Simple import approach - create relationships immediately when nodes are created.

        Each CSV row with a URN gets its own transaction in which the
        establishment node and all of its related nodes/relationships are
        written. Rows without a URN are skipped and not counted.

        Args:
            test_mode: When true, stop after 100 imported rows.

        Returns:
            A result dict. On failure: ``success=False`` and ``message``.
            On success: ``success``, ``message``, ``total_rows``,
            ``processing_time`` (seconds), ``nodes_created`` and
            ``relationships_created``.
        """
        logger.info("Starting simple Edubase data import...")

        edubase_file = os.path.join(self.import_dir, "edubasealldata20250828.csv")

        if not os.path.exists(edubase_file):
            return {
                "success": False,
                "message": f"Edubase file not found: {edubase_file}"
            }

        try:
            start_time = time.time()

            # Try different encodings to handle potential encoding issues
            encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
            working_encoding = None

            for encoding in encodings_to_try:
                try:
                    with open(edubase_file, 'r', encoding=encoding, newline='') as file:
                        # Header plus first data row must parse as CSV.
                        next(csv.DictReader(file))
                        # Decode the whole file, not just the first row, so an
                        # encoding that only fails further down is rejected
                        # here instead of aborting the import half-way.
                        for _ in file:
                            pass
                    working_encoding = encoding
                    break
                except UnicodeDecodeError:
                    continue
                except StopIteration:
                    # Empty file: say so, rather than surfacing the empty
                    # message that a bare StopIteration carries.
                    return {
                        "success": False,
                        "message": f"Edubase file contains no data rows: {edubase_file}"
                    }

            if not working_encoding:
                return {
                    "success": False,
                    "message": "Could not determine file encoding"
                }

            logger.info(f"Using encoding: {working_encoding}")

            # Process the CSV file
            relationships_created = 0
            # NOTE(review): these per-label counters are reported in the result
            # but nothing in this method increments them, so they are always 0.
            nodes_created = {
                'LocalAuthority': 0, 'EstablishmentType': 0, 'EstablishmentTypeGroup': 0,
                'EstablishmentStatus': 0, 'PhaseOfEducation': 0, 'GenderType': 0,
                'ReligiousCharacter': 0, 'Diocese': 0, 'Country': 0, 'County': 0,
                'Town': 0, 'Locality': 0, 'GovernmentOfficeRegion': 0,
                'DistrictAdministrative': 0, 'SpecialClasses': 0, 'SixthForm': 0
            }

            create_est_query = """
                            CREATE (e:Establishment $props)
                            RETURN e
                            """

            # One session for the whole file (instead of one per row); each row
            # still gets its own transaction.
            with open(edubase_file, 'r', encoding=working_encoding, newline='') as file, \
                    self.neo4j_service.driver.session(database=self.db_name) as session:
                csv_reader = csv.DictReader(file)

                total_rows = 0
                for row in csv_reader:
                    # Decide whether the row is usable before opening a
                    # transaction, so skipped rows cost no round-trip.
                    establishment_props = self._extract_establishment_properties(row)
                    if not establishment_props:
                        continue

                    # As before, the transaction is finalised by its context
                    # manager when the block exits.
                    with session.begin_transaction() as tx:
                        est_result = tx.run(create_est_query, props=establishment_props)
                        establishment_node = est_result.single()["e"]

                        # Create related nodes and relationships immediately
                        rel_count = self._create_related_nodes_immediate(tx, establishment_node, row)
                        relationships_created += rel_count

                    total_rows += 1
                    if test_mode and total_rows >= 100:
                        break

                    if total_rows % 1000 == 0:
                        logger.info(f"Processed {total_rows} rows...")

            processing_time = time.time() - start_time

            logger.info("Simple Edubase data import completed successfully!")
            logger.info(f"Total rows processed: {total_rows}")
            logger.info(f"Processing time: {processing_time:.2f} seconds")

            return {
                "success": True,
                "message": f"Successfully imported {total_rows} Edubase records",
                "total_rows": total_rows,
                "processing_time": processing_time,
                "nodes_created": nodes_created,
                "relationships_created": relationships_created
            }

        except Exception as e:
            logger.error(f"Error in simple Edubase data import: {str(e)}")
            return {
                "success": False,
                "message": f"Error in simple Edubase data import: {str(e)}"
            }

    def _create_related_nodes_immediate(self, tx, establishment_node, row):
        """Create related nodes and relationships immediately in the same transaction.

        For every lookup value present on the row, the lookup node is MERGEd
        and linked to the establishment identified by
        ``establishment_node['urn']``. The Country/County/Town/Locality chain
        is MERGEd as well and linked parent-to-child.

        Values that are empty or equal to ``'Not applicable'`` are skipped.
        Missing cells (``None``, which ``csv.DictReader`` uses to pad short
        rows) are treated as empty instead of raising.

        Args:
            tx: Open transaction; only ``tx.run(query, **params)`` is used.
            establishment_node: Mapping-like node; only its ``'urn'`` is read.
            row: One CSV row keyed by Edubase column name.

        Returns:
            int: Number of establishment relationships for which the query
            returned a record. Links inside the geographical hierarchy are
            created but not counted.
        """
        urn = establishment_node['urn']
        relationships_created = 0

        def field(column):
            # `or ''` covers both a missing key and an explicit None cell.
            return (row.get(column) or '').strip()

        def usable(value):
            return bool(value) and value != 'Not applicable'

        def link_to_establishment(label, rel_type, **props):
            # label / rel_type are constants from this method, never row data;
            # row values only ever travel as query parameters.
            pattern = ', '.join(f"{key}: ${key}" for key in props)
            query = f"""
            MERGE (n:{label} {{{pattern}}})
            WITH n
            MATCH (e:Establishment) WHERE e.urn = $urn
            MERGE (e)-[r:{rel_type}]->(n)
            RETURN r
            """
            return 1 if tx.run(query, urn=urn, **props).single() else 0

        # Lookups identified by a "(code)" / "(name)" column pair:
        # (column stem, node label, relationship type)
        coded_lookups = (
            ('LA', 'LocalAuthority', 'IS_CONTROLLED_BY_LOCAL_AUTHORITY'),
            ('TypeOfEstablishment', 'EstablishmentType', 'IS_ESTABLISHMENT_TYPE'),
            ('EstablishmentTypeGroup', 'EstablishmentTypeGroup', 'IS_ESTABLISHMENT_TYPE_GROUP'),
            ('EstablishmentStatus', 'EstablishmentStatus', 'CURRENT_ESTABLISHMENT_STATUS'),
            ('PhaseOfEducation', 'PhaseOfEducation', 'PROVIDES_PHASE_OF_EDUCATION'),
            ('Gender', 'GenderType', 'PROVIDES_FOR_GENDER_TYPE'),
            ('ReligiousCharacter', 'ReligiousCharacter', 'INCLUDES_RELIGIOUS_CHARACTER'),
            ('Diocese', 'Diocese', 'BELONGS_TO_DIOCESE'),
        )
        for stem, label, rel_type in coded_lookups:
            code = field(f'{stem} (code)')
            name = field(f'{stem} (name)')
            if code and usable(name):
                relationships_created += link_to_establishment(label, rel_type, code=code, name=name)

        # Special Classes / Sixth Form: the column lists in _process_headers
        # name these 'SpecialClasses (name)' and 'OfficialSixthForm (name)';
        # the bare legacy keys are kept as a fallback.
        special_classes = field('SpecialClasses (name)') or field('SpecialClasses')
        if usable(special_classes) and special_classes != '0':
            relationships_created += link_to_establishment(
                'SpecialClasses', 'HAS_SPECIAL_CLASSES', name=special_classes)

        sixth_form = field('OfficialSixthForm (name)') or field('SixthForm')
        if usable(sixth_form):
            relationships_created += link_to_establishment(
                'SixthForm', 'HAS_SIXTH_FORM', name=sixth_form)

        # Geographical hierarchy: Locality -> Town -> County -> Country.
        # Each level is MERGEd by name; the link to its parent is only made
        # when a parent node with the row's parent name exists.
        country_name = field('Country (name)')
        if usable(country_name):
            country_query = """
            MERGE (c:Country {name: $name})
            RETURN c
            """
            tx.run(country_query, name=country_name)

        county_name = field('County (name)')
        if usable(county_name):
            county_query = """
            MERGE (co:County {name: $name})
            WITH co
            MATCH (c:Country) WHERE c.name = $country_name
            MERGE (co)-[r:IS_IN_COUNTRY]->(c)
            RETURN co
            """
            tx.run(county_query, name=county_name, country_name=country_name)

        town_name = field('Town')
        if usable(town_name):
            town_query = """
            MERGE (t:Town {name: $name})
            WITH t
            MATCH (co:County) WHERE co.name = $county_name
            MERGE (t)-[r:IS_IN_COUNTY]->(co)
            RETURN t
            """
            tx.run(town_query, name=town_name, county_name=county_name)

        locality_name = field('Locality')
        if usable(locality_name):
            locality_query = """
            MERGE (l:Locality {name: $name})
            WITH l
            MATCH (t:Town) WHERE t.name = $town_name
            MERGE (l)-[r:IS_IN_TOWN]->(t)
            RETURN l
            """
            tx.run(locality_query, name=locality_name, town_name=town_name)

        # Name-only lookups linked directly to the establishment.
        gor_name = field('GOR (name)')
        if usable(gor_name):
            relationships_created += link_to_establishment(
                'GovernmentOfficeRegion', 'IS_IN_GOVERNMENT_OFFICE_REGION', name=gor_name)

        district_name = field('DistrictAdministrative (name)')
        if usable(district_name):
            relationships_created += link_to_establishment(
                'DistrictAdministrative', 'IS_IN_DISTRICT_ADMINISTRATIVE', name=district_name)

        # Establishment location: link to the Locality merged above.
        if usable(locality_name):
            location_query = """
            MATCH (e:Establishment) WHERE e.urn = $urn
            MATCH (l:Locality) WHERE l.name = $locality_name
            MERGE (e)-[r:IS_LOCATED_IN_LOCALITY]->(l)
            RETURN r
            """
            result = tx.run(location_query, urn=urn, locality_name=locality_name)
            if result.single():
                relationships_created += 1

        return relationships_created

    def _process_headers(self, fieldnames: List[str]) -> None:
        """Log the CSV column count and record the logical grouping of columns.

        ``fieldnames`` is only used for the log line; the grouping stored in
        ``self.column_groups`` is a fixed table of Edubase column names.
        """
        logger.info(f"Processing {len(fieldnames)} columns from Edubase data")

        def code_and_name(stem):
            # Many Edubase attributes come as a "<stem> (code)" / "<stem> (name)" pair.
            return [f'{stem} (code)', f'{stem} (name)']

        geography_stems = (
            'GOR',
            'DistrictAdministrative',
            'AdministrativeWard',
            'ParliamentaryConstituency',
            'UrbanRural',
        )

        groups = {}
        groups['establishment'] = ['URN', 'EstablishmentNumber', 'EstablishmentName']
        groups['local_authority'] = code_and_name('LA')
        groups['establishment_type'] = code_and_name('TypeOfEstablishment')
        groups['establishment_type_group'] = code_and_name('EstablishmentTypeGroup')
        groups['establishment_status'] = code_and_name('EstablishmentStatus')
        groups['phase_education'] = code_and_name('PhaseOfEducation') + ['StatutoryLowAge', 'StatutoryHighAge']
        groups['boarders'] = code_and_name('Boarders')
        groups['nursery'] = ['NurseryProvision (name)']
        groups['sixth_form'] = code_and_name('OfficialSixthForm')
        groups['gender'] = code_and_name('Gender')
        groups['religious'] = code_and_name('ReligiousCharacter') + ['ReligiousEthos (name)']
        groups['diocese'] = code_and_name('Diocese')
        groups['admissions'] = code_and_name('AdmissionsPolicy')
        groups['capacity'] = ['SchoolCapacity']
        groups['special_classes'] = code_and_name('SpecialClasses')
        groups['census'] = ['CensusDate', 'NumberOfPupils', 'NumberOfBoys', 'NumberOfGirls', 'PercentageFSM']
        groups['trust'] = code_and_name('TrustSchoolFlag') + code_and_name('Trusts')
        groups['sponsor'] = ['SchoolSponsorFlag (name)', 'SchoolSponsors (name)']
        groups['federation'] = ['FederationFlag (name)'] + code_and_name('Federations')
        groups['ukprn'] = ['UKPRN']
        groups['fe_type'] = ['FEHEIdentifier', 'FurtherEducationType (name)']
        groups['dates'] = ['OpenDate', 'CloseDate', 'LastChangedDate']
        groups['address'] = ['Street', 'Locality', 'Address3', 'Town', 'County (name)', 'Postcode']
        groups['contact'] = ['SchoolWebsite', 'TelephoneNum']
        groups['head_teacher'] = ['HeadTitle (name)', 'HeadFirstName', 'HeadLastName', 'HeadPreferredJobTitle']
        groups['inspection'] = ['BSOInspectorateName (name)', 'InspectorateReport',
                                'DateOfLastInspectionVisit', 'NextInspectionVisit']
        groups['special_provision'] = ['TeenMoth (name)', 'TeenMothPlaces', 'CCF (name)',
                                       'SENPRU (name)', 'EBD (name)', 'PlacesPRU']
        groups['ft_provision'] = ['FTProv (name)', 'EdByOther (name)', 'Section41Approved (name)']
        # SEN1 (name) .. SEN13 (name)
        groups['sen_provision'] = [f'SEN{index} (name)' for index in range(1, 14)]
        groups['resourced_provision'] = ['TypeOfResourcedProvision (name)', 'ResourcedProvisionOnRoll',
                                         'ResourcedProvisionCapacity']
        groups['sen_unit'] = ['SenUnitOnRoll', 'SenUnitCapacity']
        groups['geography'] = [column for stem in geography_stems for column in code_and_name(stem)]
        groups['gss_codes'] = ['GSSLACode (name)', 'Easting', 'Northing', 'MSOA (name)', 'LSOA (name)']
        groups['inspection_details'] = ['InspectorateName (name)', 'SENStat', 'SENNoStat']
        groups['boarding'] = ['BoardingEstablishment (name)']
        groups['props'] = ['PropsName']
        groups['previous'] = code_and_name('PreviousLA') + ['PreviousEstablishmentNumber']
        groups['country'] = ['Country (name)']
        groups['uprn'] = ['UPRN']
        groups['site'] = ['SiteName']
        groups['qab'] = (code_and_name('QABName') + code_and_name('EstablishmentAccredited')
                         + ['QABReport', 'AccreditationExpiryDate'])
        groups['ch_number'] = ['CHNumber']
        groups['msoa_lsoa_codes'] = ['MSOA (code)', 'LSOA (code)']
        groups['fsm'] = ['FSM']

        self.column_groups = groups

    def _process_edubase_row(self, row: Dict[str, str]) -> Tuple[List[Tuple[str, Dict[str, Any]]], List[Tuple[str, str, str]]]:
        """Turn one Edubase CSV row into ``(nodes, relationships)``.

        ``nodes`` holds ``(label, properties)`` tuples and ``relationships``
        holds ``(rel_type, start_key, end_key)`` tuples. Any exception raised
        while processing is logged against the row's URN and whatever had been
        collected up to that point is still returned.
        """
        collected_nodes: List[Tuple[str, Dict[str, Any]]] = []
        collected_relationships: List[Tuple[str, str, str]] = []

        try:
            # The establishment itself comes first ...
            main_props = self._extract_establishment_properties(row)
            if main_props:
                collected_nodes.append(('Establishment', main_props))

            # ... followed by its lookup nodes and the links to them.
            self._create_related_nodes_and_relationships(
                row, collected_nodes, collected_relationships
            )
        except Exception as e:
            logger.error(f"Error processing row {row.get('URN', 'unknown')}: {str(e)}")

        return collected_nodes, collected_relationships

    def _extract_establishment_properties(self, row: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Extract properties for the main establishment node.

        Text columns are stripped; date/int/float columns go through
        ``self._parse_date`` / ``self._parse_int`` / ``self._parse_float``.
        A missing cell (``None``, which ``csv.DictReader`` uses to pad short
        rows) is treated as an empty string instead of raising on ``.strip()``.

        Args:
            row: One CSV row keyed by Edubase column name.

        Returns:
            ``None`` when the row has no URN; otherwise a dict of properties
            with every ``None``, empty-string and ``'Not applicable'`` value
            removed.
        """
        def raw(column: str) -> str:
            # `or ''` covers both a missing key and an explicit None cell.
            return row.get(column) or ''

        urn = raw('URN').strip()
        if not urn:
            return None

        def text(value: str) -> str:
            return value.strip()

        parse_date = self._parse_date
        parse_int = self._parse_int
        parse_float = self._parse_float

        # (property name, CSV column, converter) — order matches the node schema.
        field_specs = (
            ('establishmentNumber', 'EstablishmentNumber', text),
            ('establishmentName', 'EstablishmentName', text),
            ('openDate', 'OpenDate', parse_date),
            ('closeDate', 'CloseDate', parse_date),
            ('lastChangedDate', 'LastChangedDate', parse_date),
            ('schoolCapacity', 'SchoolCapacity', parse_int),
            ('numberOfPupils', 'NumberOfPupils', parse_int),
            ('numberOfBoys', 'NumberOfBoys', parse_int),
            ('numberOfGirls', 'NumberOfGirls', parse_int),
            ('percentageFSM', 'PercentageFSM', parse_float),
            ('statutoryLowAge', 'StatutoryLowAge', parse_int),
            ('statutoryHighAge', 'StatutoryHighAge', parse_int),
            ('easting', 'Easting', parse_int),
            ('northing', 'Northing', parse_int),
            ('street', 'Street', text),
            ('locality', 'Locality', text),
            ('address3', 'Address3', text),
            ('town', 'Town', text),
            ('county', 'County (name)', text),
            ('postcode', 'Postcode', text),
            ('schoolWebsite', 'SchoolWebsite', text),
            ('telephoneNum', 'TelephoneNum', text),
            ('headTitle', 'HeadTitle (name)', text),
            ('headFirstName', 'HeadFirstName', text),
            ('headLastName', 'HeadLastName', text),
            ('headPreferredJobTitle', 'HeadPreferredJobTitle', text),
            ('censusDate', 'CensusDate', parse_date),
            ('teenMothPlaces', 'TeenMothPlaces', parse_int),
            ('placesPRU', 'PlacesPRU', parse_int),
            ('resourcedProvisionOnRoll', 'ResourcedProvisionOnRoll', parse_int),
            ('resourcedProvisionCapacity', 'ResourcedProvisionCapacity', parse_int),
            ('senUnitOnRoll', 'SenUnitOnRoll', parse_int),
            ('senUnitCapacity', 'SenUnitCapacity', parse_int),
            ('fsm', 'FSM', parse_int),
            ('ukprn', 'UKPRN', text),
            ('uprn', 'UPRN', text),
            ('chNumber', 'CHNumber', text),
        )

        props: Dict[str, Any] = {'urn': urn}
        for prop_name, column, convert in field_specs:
            props[prop_name] = convert(raw(column))

        # Remove empty/None values (a legitimate 0 is kept).
        return {
            key: value
            for key, value in props.items()
            if value is not None and value != '' and value != 'Not applicable'
        }

    def _create_related_nodes_and_relationships(self, row: Dict[str, str], nodes: List[Tuple[str, Dict[str, Any]]], relationships: List[Tuple[str, str, str]]) -> None:
        """Create related nodes and relationships for an establishment"""
        urn = row.get('URN', '').strip()
        if not urn:
            return

        # Local Authority
        la_code = row.get('LA (code)', '').strip()
        la_name = row.get('LA (name)', '').strip()
        if la_code and la_name and la_name != 'Not applicable':
            la_key = f"{la_code}_{la_name}"
            if la_key not in self.created_nodes['LocalAuthority']:
                nodes.append(('LocalAuthority', {'code': la_code, 'name': la_name}))
                self.created_nodes['LocalAuthority'].add(la_key)
            relationships.append(('IS_CONTROLLED_BY_LOCAL_AUTHORITY', urn, la_key))

        # Establishment Type
        type_code = row.get('TypeOfEstablishment (code)', '').strip()
        type_name = row.get('TypeOfEstablishment (name)', '').strip()
        if type_code and type_name and type_name != 'Not applicable':
            type_key = f"{type_code}_{type_name}"
            if type_key not in self.created_nodes['EstablishmentType']:
                nodes.append(('EstablishmentType', {'code': type_code, 'name': type_name}))
                self.created_nodes['EstablishmentType'].add(type_key)
            relationships.append(('IS_ESTABLISHMENT_TYPE', urn, type_key))

        # Establishment Type Group
        group_code = row.get('EstablishmentTypeGroup (code)', '').strip()
        group_name = row.get('EstablishmentTypeGroup (name)', '').strip()
        if group_code and group_name and group_name != 'Not applicable':
            group_key = f"{group_code}_{group_name}"
            if group_key not in self.created_nodes['EstablishmentTypeGroup']:
                nodes.append(('EstablishmentTypeGroup', {'code': group_code, 'name': group_name}))
                self.created_nodes['EstablishmentTypeGroup'].add(group_key)
            relationships.append(('IS_ESTABLISHMENT_TYPE_GROUP', urn, group_key))

        # Establishment Status
        status_code = row.get('EstablishmentStatus (code)', '').strip()
        status_name = row.get('EstablishmentStatus (name)', '').strip()
        if status_code and status_name and status_name != 'Not applicable':
            status_key = f"{status_code}_{status_name}"
            if status_key not in self.created_nodes['EstablishmentStatus']:
                nodes.append(('EstablishmentStatus', {'code': status_code, 'name': status_name}))
                self.created_nodes['EstablishmentStatus'].add(status_key)
            relationships.append(('CURRENT_ESTABLISHMENT_STATUS', urn, status_key))

        # Phase of Education
        phase_code = row.get('PhaseOfEducation (code)', '').strip()
        phase_name = row.get('PhaseOfEducation (name)', '').strip()
        if phase_code and phase_name and phase_name != 'Not applicable':
            phase_key = f"{phase_code}_{phase_name}"
            if phase_key not in self.created_nodes['PhaseOfEducation']:
                nodes.append(('PhaseOfEducation', {'code': phase_code, 'name': phase_name}))
                self.created_nodes['PhaseOfEducation'].add(phase_key)
            relationships.append(('PROVIDES_PHASE_OF_EDUCATION', urn, phase_key))

        # Gender
        gender_code = row.get('Gender (code)', '').strip()
        gender_name = row.get('Gender (name)', '').strip()
        if gender_code and gender_name and gender_name != 'Not applicable':
            gender_key = f"{gender_code}_{gender_name}"
            if gender_key not in self.created_nodes['GenderType']:
                nodes.append(('GenderType', {'code': gender_code, 'name': gender_name}))
                self.created_nodes['GenderType'].add(gender_key)
            relationships.append(('PROVIDES_FOR_GENDER_TYPE', urn, gender_key))

        # Religious Character
        rel_code = row.get('ReligiousCharacter (code)', '').strip()
        rel_name = row.get('ReligiousCharacter (name)', '').strip()
        if rel_code and rel_name and rel_name != 'Not applicable':
            rel_key = f"{rel_code}_{rel_name}"
            if rel_key not in self.created_nodes['ReligiousCharacter']:
                nodes.append(('ReligiousCharacter', {'code': rel_code, 'name': rel_name}))
                self.created_nodes['ReligiousCharacter'].add(rel_key)
            relationships.append(('INCLUDES_RELIGIOUS_CHARACTER', urn, rel_key))

        # Diocese
        diocese_code = row.get('Diocese (code)', '').strip()
        diocese_name = row.get('Diocese (name)', '').strip()
        if diocese_code and diocese_name and diocese_name != 'Not applicable':
            diocese_key = f"{diocese_code}_{diocese_name}"
            if diocese_key not in self.created_nodes['Diocese']:
                nodes.append(('Diocese', {'code': diocese_code, 'name': diocese_name}))
                self.created_nodes['Diocese'].add(diocese_key)
            relationships.append(('UNDER_DIOCESE', urn, diocese_key))

        # Government Office Region
        gor_code = row.get('GOR (code)', '').strip()
        gor_name = row.get('GOR (name)', '').strip()
        if gor_code and gor_name and gor_name != 'Not applicable':
            gor_key = f"{gor_code}_{gor_name}"
            if gor_key not in self.created_nodes['GovernmentOfficeRegion']:
                nodes.append(('GovernmentOfficeRegion', {'code': gor_code, 'name': gor_name}))
                self.created_nodes['GovernmentOfficeRegion'].add(gor_key)
            relationships.append(('OVERSEEN_BY_GOVERNMENT_OFFICE_REGION', urn, gor_key))

        # District Administrative
        district_code = row.get('DistrictAdministrative (code)', '').strip()
        district_name = row.get('DistrictAdministrative (name)', '').strip()
        if district_code and district_name and district_name != 'Not applicable':
            district_key = f"{district_code}_{district_name}"
            if district_key not in self.created_nodes['DistrictAdministrative']:
                nodes.append(('DistrictAdministrative', {'code': district_code, 'name': district_name}))
                self.created_nodes['DistrictAdministrative'].add(district_key)
            relationships.append(('WITHIN_DISTRICT_ADMINISTRATIVE', urn, district_key))

        # Country
        country_name = row.get('Country (name)', '').strip()
        if country_name and country_name != 'Not applicable':
            if country_name not in self.created_nodes['Country']:
                nodes.append(('Country', {'name': country_name}))
                self.created_nodes['Country'].add(country_name)
            relationships.append(('LOCATED_IN_COUNTRY', urn, country_name))

        # County
        county_name = row.get('County (name)', '').strip()
        if county_name and county_name != 'Not applicable':
            if county_name not in self.created_nodes['County']:
                nodes.append(('County', {'name': county_name}))
                self.created_nodes['County'].add(county_name)
            relationships.append(('LOCATED_IN_COUNTY', urn, county_name))

            # County is in Country
            if country_name and country_name != 'Not applicable':
                relationships.append(('PART_OF_COUNTRY', county_name, country_name))

        # Town
        town_name = row.get('Town', '').strip()
        if town_name and town_name != 'Not applicable':
            if town_name not in self.created_nodes['Town']:
                nodes.append(('Town', {'name': town_name}))
                self.created_nodes['Town'].add(town_name)
            relationships.append(('LOCATED_IN_TOWN', urn, town_name))

            # Town is in County
            if county_name and county_name != 'Not applicable':
                relationships.append(('PART_OF_COUNTY', town_name, county_name))

        # Locality
        locality_name = row.get('Locality', '').strip()
        if locality_name and locality_name != 'Not applicable':
            if locality_name not in self.created_nodes['Locality']:
                nodes.append(('Locality', {'name': locality_name}))
                self.created_nodes['Locality'].add(locality_name)
            relationships.append(('LOCATED_IN_LOCALITY', urn, locality_name))

            # Locality is in Town
            if town_name and town_name != 'Not applicable':
                relationships.append(('PART_OF_TOWN', locality_name, town_name))

        # Special Classes
        special_classes_code = row.get('SpecialClasses (code)', '').strip()
        special_classes_name = row.get('SpecialClasses (name)', '').strip()
        if special_classes_code and special_classes_name and special_classes_name != 'Not applicable':
            special_classes_key = f"{special_classes_code}_{special_classes_name}"
            if special_classes_key not in self.created_nodes['SpecialClasses']:
                nodes.append(('SpecialClasses', {'code': special_classes_code, 'name': special_classes_name}))
                self.created_nodes['SpecialClasses'].add(special_classes_key)
            relationships.append(('PROVIDES_SPECIAL_CLASSES', urn, special_classes_key))

        # Further Education Type
        fe_type_name = row.get('FurtherEducationType (name)', '').strip()
        if fe_type_name and fe_type_name != 'Not applicable':
            if fe_type_name not in self.created_nodes['FurtherEducationType']:
                nodes.append(('FurtherEducationType', {'name': fe_type_name}))
                self.created_nodes['FurtherEducationType'].add(fe_type_name)
            relationships.append(('PROVIDES_FURTHER_EDUCATION_TYPE', urn, fe_type_name))

        # Sixth Form
        sixth_form_code = row.get('OfficialSixthForm (code)', '').strip()
        sixth_form_name = row.get('OfficialSixthForm (name)', '').strip()
        if sixth_form_code and sixth_form_name and sixth_form_name != 'Not applicable':
            sixth_form_key = f"{sixth_form_code}_{sixth_form_name}"
            if sixth_form_key not in self.created_nodes['SixthForm']:
                nodes.append(('SixthForm', {'code': sixth_form_code, 'name': sixth_form_name}))
                self.created_nodes['SixthForm'].add(sixth_form_key)
            relationships.append(('PROVIDES_SIXTH_FORM', urn, sixth_form_key))

    def _create_all_nodes(self, all_nodes: List[Tuple[str, Dict[str, Any]]]) -> Dict[str, Any]:
        """Create every collected node and index the created nodes by lookup key.

        Args:
            all_nodes: ``(label, properties)`` tuples to create, in order.

        Returns:
            Mapping of lookup key to the created node. The key is the ``urn``
            for Establishment nodes, ``"<code>_<name>"`` when both properties
            exist, the ``name`` when only that exists, and the node id as a
            string otherwise. An empty dict is returned if an error escapes
            the per-node handling (for example the session cannot be opened).
        """
        # NOTE: keys are not namespaced by label, so a node whose key equals an
        # earlier node's key replaces that earlier entry in the map.
        created: Dict[str, Any] = {}
        try:
            with self.neo4j_service.driver.session(database=self.db_name) as session:
                for label, properties in all_nodes:
                    try:
                        if label == 'Establishment':
                            # Establishments are keyed by URN; without one nothing is created
                            key = properties.get('urn')
                            if not key:
                                continue
                            node = create_node_direct(session, label, properties)
                            if not node:
                                continue
                        else:
                            # Lookup nodes are created first, then keyed from their properties
                            node = create_node_direct(session, label, properties)
                            if not node:
                                continue
                            has_name = 'name' in properties
                            if has_name and 'code' in properties:
                                key = f"{properties['code']}_{properties['name']}"
                            elif has_name:
                                key = properties['name']
                            else:
                                key = str(node.id)
                        created[key] = node
                        logger.debug(f"Created {label} node with key: {key}")
                    except Exception as e:
                        # One bad node must not stop the rest of the batch
                        logger.error(f"Failed to create {label} node: {str(e)}")
                return created
        except Exception as e:
            logger.error(f"Error creating all nodes: {str(e)}")
            return {}

    def _create_all_relationships(self, all_relationships: List[Tuple[str, str, str]], node_map: Dict[str, Any]) -> int:
        """Create all relationships from the collected list of (rel_type, start_key, end_key) tuples"""
        relationships_created = 0
        try:
            with self.neo4j_service.driver.session(database=self.db_name) as session:
                for rel_type, start_key, end_key in all_relationships:
                    start_node = node_map.get(start_key)
                    end_node = node_map.get(end_key)

                    if start_node and end_node:
                        try:
                            # Use property-based matching instead of deprecated ID() function
                            # This is the recommended approach for Neo4j 5.x

                            # Get properties from the nodes
                            start_props = dict(start_node)
                            end_props = dict(end_node)

                            # Get labels
                            start_label = list(start_node.labels)[0] if start_node.labels else 'Node'
                            end_label = list(end_node.labels)[0] if end_node.labels else 'Node'

                            # Create a unique property-based query
                            # For establishments, use URN (unique identifier)
                            if start_label == 'Establishment' and 'urn' in start_props:
                                start_match = f"n1:{start_label} {{urn: $start_urn}}"
                                start_params = {'start_urn': start_props['urn']}
                            else:
                                # For other nodes, use the combined code_name key format
                                # This matches how we create the nodes in the first place
                                if 'code' in start_props and 'name' in start_props:
                                    start_match = f"n1:{start_label} {{code: $start_code, name: $start_name}}"
                                    start_params = {'start_code': start_props['code'], 'start_name': start_props['name']}
                                elif 'name' in start_props:
                                    start_match = f"n1:{start_label} {{name: $start_name}}"
                                    start_params = {'start_name': start_props['name']}
                                else:
                                    print(f"ERROR: No unique property found for start node {start_label}: {start_props}")
                                    continue

                            # Same for end node
                            if end_label == 'Establishment' and 'urn' in end_props:
                                end_match = f"n2:{end_label} {{urn: $end_urn}}"
                                end_params = {'end_urn': end_props['urn']}
                            else:
                                # For other nodes, use the combined code_name key format
                                if 'code' in end_props and 'name' in end_props:
                                    end_match = f"n2:{end_label} {{code: $end_code, name: $end_name}}"
                                    end_params = {'end_code': end_props['code'], 'end_name': end_props['name']}
                                elif 'name' in end_props:
                                    end_match = f"n2:{end_label} {{name: $end_name}}"
                                    end_params = {'end_name': end_props['name']}
                                else:
                                    print(f"ERROR: No unique property found for end node {end_label}: {end_props}")
                                    continue

                            # Combine parameters
                            params = {**start_params, **end_params}

                            # Create relationship using property-based matching
                            query = f"""
                            MATCH ({start_match}), ({end_match})
                            MERGE (n1)-[r:{rel_type}]->(n2)
                            RETURN r
                            """

                            result = session.run(query, **params)
                            record = result.single()

                            if record and record["r"]:
                                self.created_relationships.add(f"{start_key}-{rel_type}-{end_key}")
                                relationships_created += 1
                                print(f"SUCCESS: Created relationship {rel_type} between {start_key} and {end_key}")
                            else:
                                print(f"FAILED: Could not create relationship {rel_type} between {start_key} and {end_key}")

                        except Exception as e:
                            print(f"ERROR: Exception creating relationship {rel_type} between {start_key} and {end_key}: {str(e)}")
                    else:
                        if not start_node:
                            logger.warning(f"Start node not found for relationship {rel_type} with key {start_key}")
                        if not end_node:
                            logger.warning(f"End node not found for relationship {rel_type} with key {end_key}")
            return relationships_created
        except Exception as e:
            logger.error(f"Error creating all relationships: {str(e)}")
            return 0

    def _parse_date(self, date_str: str) -> Optional[str]:
        """Parse date string to ISO format"""
        if not date_str or date_str.strip() == '' or date_str.strip() == 'Not applicable':
            return None

        try:
            # Handle DD-MM-YYYY format
            if '-' in date_str:
                parts = date_str.split('-')
                if len(parts) == 3:
                    day, month, year = parts
                    if len(year) == 2:
                        year = f"20{year}" if int(year) < 50 else f"19{year}"
                    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
        except:
            pass

        return date_str.strip()

    def _parse_int(self, int_str: str) -> Optional[int]:
        """Parse integer string"""
        if not int_str or int_str.strip() == '' or int_str.strip() == 'Not applicable':
            return None

        try:
            return int(int_str.strip())
        except:
            return None

    def _parse_float(self, float_str: str) -> Optional[float]:
        """Parse float string"""
        if not float_str or float_str.strip() == '' or float_str.strip() == 'Not applicable':
            return None

        try:
            return float(float_str.strip())
        except:
            return None

def import_gais_data() -> Dict[str, Any]:
    """Run the Edubase import into the GAIS Neo4j database.

    Builds a ``GAISDataImporter`` and calls ``import_edubase_data_simple``
    with ``test_mode=False``, logging the outcome.

    Returns:
        The importer's result dict (read here for its ``"success"`` and
        ``"message"`` entries). If anything raises, a dict with
        ``"success": False`` and a ``"message"`` describing the error.
    """
    logger.info("Starting GAIS data import...")

    try:
        # Process all rows in the CSV file
        outcome = GAISDataImporter().import_edubase_data_simple(test_mode=False)

        if not outcome["success"]:
            logger.error(f"GAIS data import failed: {outcome['message']}")
        else:
            logger.info("GAIS data import completed successfully!")

        return outcome

    except Exception as e:
        failure_message = f"Error in GAIS data import: {str(e)}"
        logger.error(failure_message)
        return {"success": False, "message": failure_message}