# NOTE: removed page-capture artifact lines ("1043 lines / 51 KiB / Python")
"""
|
|
GAIS Data Import Module for ClassroomCopilot
|
|
Handles import of publicly available school databases into Neo4j
|
|
Starting with Edubase All Data
|
|
"""
|
|
import os
|
|
import csv
|
|
import time
|
|
from typing import Dict, Any, List, Optional, Set, Tuple
|
|
from datetime import datetime
|
|
from modules.logger_tool import initialise_logger
|
|
from modules.database.services.neo4j_service import Neo4jService
|
|
from modules.database.tools.neo4j_session_tools import create_relationship
|
|
|
|
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
|
|
|
def create_node_direct(session, label, properties):
|
|
"""Create a node and return it directly without using transaction ID lookup"""
|
|
try:
|
|
query = f"""
|
|
CREATE (n:{label} $properties)
|
|
RETURN n
|
|
"""
|
|
result = session.run(query, properties=properties)
|
|
record = result.single()
|
|
if record:
|
|
return record["n"]
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Error creating {label} node: {str(e)}")
|
|
return None
|
|
|
|
class GAISDataImporter:
|
|
"""Handles import of publicly available school databases into Neo4j"""
|
|
|
|
def __init__(self):
|
|
self.neo4j_service = Neo4jService()
|
|
self.db_name = os.getenv("NEO4J_GAIS_DATA", "gaisdata") # Use dedicated database for GAIS data
|
|
self.import_dir = os.path.join(os.path.dirname(__file__), "import")
|
|
|
|
# Track created nodes to avoid duplicates
|
|
self.created_nodes = {
|
|
'LocalAuthority': set(),
|
|
'EstablishmentType': set(),
|
|
'EstablishmentTypeGroup': set(),
|
|
'EstablishmentStatus': set(),
|
|
'PhaseOfEducation': set(),
|
|
'BoarderType': set(),
|
|
'GenderType': set(),
|
|
'ReligiousCharacter': set(),
|
|
'Diocese': set(),
|
|
'AdmissionsPolicy': set(),
|
|
'SpecialClasses': set(),
|
|
'TrustSchoolFlag': set(),
|
|
'FederationFlag': set(),
|
|
'Country': set(),
|
|
'County': set(),
|
|
'Town': set(),
|
|
'Locality': set(),
|
|
'GovernmentOfficeRegion': set(),
|
|
'DistrictAdministrative': set(),
|
|
'AdministrativeWard': set(),
|
|
'ParliamentaryConstituency': set(),
|
|
'UrbanRural': set(),
|
|
'Inspectorate': set(),
|
|
'QAB': set(),
|
|
'FurtherEducationType': set(),
|
|
'SixthForm': set()
|
|
}
|
|
|
|
# Track created relationships
|
|
self.created_relationships = set()
|
|
|
|
# Batch processing
|
|
self.batch_size = 1000
|
|
self.current_batch = []
|
|
|
|
# Ensure the GAIS database exists
|
|
self._ensure_database_exists()
|
|
|
|
def _ensure_database_exists(self) -> None:
|
|
"""Ensure the GAIS database exists, create if it doesn't"""
|
|
try:
|
|
# Check if database exists
|
|
with self.neo4j_service.driver.session() as session:
|
|
result = session.run("SHOW DATABASES")
|
|
databases = [record["name"] for record in result]
|
|
|
|
if self.db_name not in databases:
|
|
logger.info(f"Creating database '{self.db_name}' for GAIS data...")
|
|
# Create the database
|
|
session.run(f"CREATE DATABASE {self.db_name}")
|
|
logger.info(f"Database '{self.db_name}' created successfully")
|
|
else:
|
|
logger.info(f"Database '{self.db_name}' already exists")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error ensuring database exists: {str(e)}")
|
|
raise
|
|
|
|
def import_edubase_data(self, test_mode=False) -> Dict[str, Any]:
|
|
"""Import Edubase All Data into Neo4j"""
|
|
logger.info("Starting Edubase data import...")
|
|
|
|
edubase_file = os.path.join(self.import_dir, "edubasealldata20250828.csv")
|
|
|
|
if not os.path.exists(edubase_file):
|
|
return {
|
|
"success": False,
|
|
"message": f"Edubase file not found: {edubase_file}"
|
|
}
|
|
|
|
try:
|
|
start_time = time.time()
|
|
|
|
# Try different encodings to handle potential encoding issues
|
|
encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
|
|
working_encoding = None
|
|
|
|
for encoding in encodings_to_try:
|
|
try:
|
|
logger.info(f"Trying to read CSV with {encoding} encoding...")
|
|
with open(edubase_file, 'r', encoding=encoding) as file:
|
|
csv_reader = csv.DictReader(file)
|
|
# Test reading the first row to verify encoding works
|
|
first_row = next(csv_reader)
|
|
logger.info(f"Successfully read CSV with {encoding} encoding")
|
|
working_encoding = encoding
|
|
break
|
|
except UnicodeDecodeError as e:
|
|
logger.warning(f"Failed to read with {encoding} encoding: {str(e)}")
|
|
continue
|
|
except Exception as e:
|
|
logger.warning(f"Unexpected error with {encoding} encoding: {str(e)}")
|
|
continue
|
|
|
|
if working_encoding is None:
|
|
return {
|
|
"success": False,
|
|
"message": "Failed to read CSV file with any supported encoding"
|
|
}
|
|
|
|
# Now read the file with the working encoding and collect all data
|
|
all_nodes = [] # List of (label, properties) tuples
|
|
all_relationships = [] # List of (rel_type, start_key, end_key) tuples
|
|
|
|
# Track unique relationships to avoid duplicates during collection
|
|
unique_relationships = set()
|
|
|
|
with open(edubase_file, 'r', encoding=working_encoding) as file:
|
|
csv_reader = csv.DictReader(file)
|
|
|
|
# Process headers and create schema
|
|
self._process_headers(csv_reader.fieldnames)
|
|
|
|
# Process data rows and collect all nodes and relationships
|
|
total_rows = 0
|
|
for row in csv_reader:
|
|
nodes, relationships = self._process_edubase_row(row)
|
|
all_nodes.extend(nodes)
|
|
|
|
# Only add relationships that haven't been seen before
|
|
for rel in relationships:
|
|
if rel not in unique_relationships:
|
|
all_relationships.append(rel)
|
|
unique_relationships.add(rel)
|
|
|
|
total_rows += 1
|
|
|
|
# In test mode, only process first 100 rows
|
|
if test_mode and total_rows >= 100:
|
|
break
|
|
|
|
if total_rows % 1000 == 0:
|
|
logger.info(f"Collected data from {total_rows} rows...")
|
|
|
|
logger.info(f"Collected {len(all_nodes)} nodes and {len(all_relationships)} unique relationships from {total_rows} rows")
|
|
|
|
# Now create all nodes first
|
|
logger.info("Creating all nodes...")
|
|
node_map = self._create_all_nodes(all_nodes)
|
|
|
|
# Then create all relationships
|
|
logger.info("Creating all relationships...")
|
|
relationships_created = self._create_all_relationships(all_relationships, node_map)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error importing Edubase data: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"message": f"Error importing Edubase data: {str(e)}"
|
|
}
|
|
|
|
end_time = time.time()
|
|
processing_time = end_time - start_time
|
|
|
|
logger.info(f"Edubase data import completed successfully!")
|
|
logger.info(f"Total rows processed: {total_rows}")
|
|
logger.info(f"Processing time: {processing_time:.2f} seconds")
|
|
|
|
return {
|
|
"success": True,
|
|
"message": f"Successfully imported {total_rows} Edubase records",
|
|
"total_rows": total_rows,
|
|
"processing_time": processing_time,
|
|
"nodes_created": {k: len(v) for k, v in self.created_nodes.items()},
|
|
"relationships_created": relationships_created
|
|
}
|
|
|
|
def import_edubase_data_simple(self, test_mode=False) -> Dict[str, Any]:
|
|
"""Simple import approach - create relationships immediately when nodes are created"""
|
|
logger.info("Starting simple Edubase data import...")
|
|
|
|
edubase_file = os.path.join(self.import_dir, "edubasealldata20250828.csv")
|
|
|
|
if not os.path.exists(edubase_file):
|
|
return {
|
|
"success": False,
|
|
"message": f"Edubase file not found: {edubase_file}"
|
|
}
|
|
|
|
try:
|
|
start_time = time.time()
|
|
|
|
# Try different encodings to handle potential encoding issues
|
|
encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
|
|
working_encoding = None
|
|
|
|
for encoding in encodings_to_try:
|
|
try:
|
|
with open(edubase_file, 'r', encoding=encoding) as file:
|
|
csv_reader = csv.DictReader(file)
|
|
# Just read the first row to test
|
|
next(csv_reader)
|
|
working_encoding = encoding
|
|
break
|
|
except UnicodeDecodeError:
|
|
continue
|
|
|
|
if not working_encoding:
|
|
return {
|
|
"success": False,
|
|
"message": "Could not determine file encoding"
|
|
}
|
|
|
|
logger.info(f"Using encoding: {working_encoding}")
|
|
|
|
# Process the CSV file
|
|
relationships_created = 0
|
|
nodes_created = {
|
|
'LocalAuthority': 0, 'EstablishmentType': 0, 'EstablishmentTypeGroup': 0,
|
|
'EstablishmentStatus': 0, 'PhaseOfEducation': 0, 'GenderType': 0,
|
|
'ReligiousCharacter': 0, 'Diocese': 0, 'Country': 0, 'County': 0,
|
|
'Town': 0, 'Locality': 0, 'GovernmentOfficeRegion': 0,
|
|
'DistrictAdministrative': 0, 'SpecialClasses': 0, 'SixthForm': 0
|
|
}
|
|
|
|
with open(edubase_file, 'r', encoding=working_encoding) as file:
|
|
csv_reader = csv.DictReader(file)
|
|
|
|
total_rows = 0
|
|
for row in csv_reader:
|
|
# Create establishment and related nodes with relationships in same transaction
|
|
with self.neo4j_service.driver.session(database=self.db_name) as session:
|
|
with session.begin_transaction() as tx:
|
|
# Create establishment node
|
|
establishment_props = self._extract_establishment_properties(row)
|
|
if not establishment_props:
|
|
continue
|
|
|
|
# Create establishment
|
|
create_est_query = """
|
|
CREATE (e:Establishment $props)
|
|
RETURN e
|
|
"""
|
|
est_result = tx.run(create_est_query, props=establishment_props)
|
|
establishment_node = est_result.single()["e"]
|
|
|
|
# Create related nodes and relationships immediately
|
|
rel_count = self._create_related_nodes_immediate(tx, establishment_node, row)
|
|
relationships_created += rel_count
|
|
|
|
total_rows += 1
|
|
if test_mode and total_rows >= 100:
|
|
break
|
|
|
|
if total_rows % 1000 == 0:
|
|
logger.info(f"Processed {total_rows} rows...")
|
|
|
|
processing_time = time.time() - start_time
|
|
|
|
logger.info(f"Simple Edubase data import completed successfully!")
|
|
logger.info(f"Total rows processed: {total_rows}")
|
|
logger.info(f"Processing time: {processing_time:.2f} seconds")
|
|
|
|
return {
|
|
"success": True,
|
|
"message": f"Successfully imported {total_rows} Edubase records",
|
|
"total_rows": total_rows,
|
|
"processing_time": processing_time,
|
|
"nodes_created": nodes_created,
|
|
"relationships_created": relationships_created
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in simple Edubase data import: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"message": f"Error in simple Edubase data import: {str(e)}"
|
|
}
|
|
|
|
def _create_related_nodes_immediate(self, tx, establishment_node, row):
|
|
"""Create related nodes and relationships immediately in the same transaction"""
|
|
relationships_created = 0
|
|
|
|
# Local Authority
|
|
la_code = row.get('LA (code)', '').strip()
|
|
la_name = row.get('LA (name)', '').strip()
|
|
if la_code and la_name and la_name != 'Not applicable':
|
|
# Create or find local authority
|
|
la_query = """
|
|
MERGE (la:LocalAuthority {code: $code, name: $name})
|
|
WITH la
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:IS_CONTROLLED_BY_LOCAL_AUTHORITY]->(la)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(la_query, code=la_code, name=la_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Establishment Type
|
|
type_code = row.get('TypeOfEstablishment (code)', '').strip()
|
|
type_name = row.get('TypeOfEstablishment (name)', '').strip()
|
|
if type_code and type_name and type_name != 'Not applicable':
|
|
type_query = """
|
|
MERGE (et:EstablishmentType {code: $code, name: $name})
|
|
WITH et
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:IS_ESTABLISHMENT_TYPE]->(et)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(type_query, code=type_code, name=type_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Establishment Type Group
|
|
type_group_code = row.get('EstablishmentTypeGroup (code)', '').strip()
|
|
type_group_name = row.get('EstablishmentTypeGroup (name)', '').strip()
|
|
if type_group_code and type_group_name and type_group_name != 'Not applicable':
|
|
type_group_query = """
|
|
MERGE (etg:EstablishmentTypeGroup {code: $code, name: $name})
|
|
WITH etg
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:IS_ESTABLISHMENT_TYPE_GROUP]->(etg)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(type_group_query, code=type_group_code, name=type_group_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Establishment Status
|
|
status_code = row.get('EstablishmentStatus (code)', '').strip()
|
|
status_name = row.get('EstablishmentStatus (name)', '').strip()
|
|
if status_code and status_name and status_name != 'Not applicable':
|
|
status_query = """
|
|
MERGE (es:EstablishmentStatus {code: $code, name: $name})
|
|
WITH es
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:CURRENT_ESTABLISHMENT_STATUS]->(es)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(status_query, code=status_code, name=status_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Phase of Education
|
|
phase_code = row.get('PhaseOfEducation (code)', '').strip()
|
|
phase_name = row.get('PhaseOfEducation (name)', '').strip()
|
|
if phase_code and phase_name and phase_name != 'Not applicable':
|
|
phase_query = """
|
|
MERGE (poe:PhaseOfEducation {code: $code, name: $name})
|
|
WITH poe
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:PROVIDES_PHASE_OF_EDUCATION]->(poe)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(phase_query, code=phase_code, name=phase_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Gender Type
|
|
gender_code = row.get('Gender (code)', '').strip()
|
|
gender_name = row.get('Gender (name)', '').strip()
|
|
if gender_code and gender_name and gender_name != 'Not applicable':
|
|
gender_query = """
|
|
MERGE (gt:GenderType {code: $code, name: $name})
|
|
WITH gt
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:PROVIDES_FOR_GENDER_TYPE]->(gt)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(gender_query, code=gender_code, name=gender_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Religious Character
|
|
religious_code = row.get('ReligiousCharacter (code)', '').strip()
|
|
religious_name = row.get('ReligiousCharacter (name)', '').strip()
|
|
if religious_code and religious_name and religious_name != 'Not applicable':
|
|
religious_query = """
|
|
MERGE (rc:ReligiousCharacter {code: $code, name: $name})
|
|
WITH rc
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:INCLUDES_RELIGIOUS_CHARACTER]->(rc)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(religious_query, code=religious_code, name=religious_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Diocese
|
|
diocese_code = row.get('Diocese (code)', '').strip()
|
|
diocese_name = row.get('Diocese (name)', '').strip()
|
|
if diocese_code and diocese_name and diocese_name != 'Not applicable':
|
|
diocese_query = """
|
|
MERGE (d:Diocese {code: $code, name: $name})
|
|
WITH d
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:BELONGS_TO_DIOCESE]->(d)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(diocese_query, code=diocese_code, name=diocese_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Special Classes
|
|
special_classes = row.get('SpecialClasses', '').strip()
|
|
if special_classes and special_classes != 'Not applicable' and special_classes != '0':
|
|
special_query = """
|
|
MERGE (sc:SpecialClasses {name: $name})
|
|
WITH sc
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:HAS_SPECIAL_CLASSES]->(sc)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(special_query, name=special_classes, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Sixth Form
|
|
sixth_form = row.get('SixthForm', '').strip()
|
|
if sixth_form and sixth_form != 'Not applicable':
|
|
sixth_form_query = """
|
|
MERGE (sf:SixthForm {name: $name})
|
|
WITH sf
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:HAS_SIXTH_FORM]->(sf)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(sixth_form_query, name=sixth_form, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Geographical hierarchy: Locality -> Town -> County -> Country
|
|
# Country
|
|
country_name = row.get('Country (name)', '').strip()
|
|
if country_name and country_name != 'Not applicable':
|
|
country_query = """
|
|
MERGE (c:Country {name: $name})
|
|
RETURN c
|
|
"""
|
|
tx.run(country_query, name=country_name)
|
|
|
|
# County
|
|
county_name = row.get('County (name)', '').strip()
|
|
if county_name and county_name != 'Not applicable':
|
|
county_query = """
|
|
MERGE (co:County {name: $name})
|
|
WITH co
|
|
MATCH (c:Country) WHERE c.name = $country_name
|
|
MERGE (co)-[r:IS_IN_COUNTRY]->(c)
|
|
RETURN co
|
|
"""
|
|
tx.run(county_query, name=county_name, country_name=country_name)
|
|
|
|
# Town
|
|
town_name = row.get('Town', '').strip()
|
|
if town_name and town_name != 'Not applicable':
|
|
town_query = """
|
|
MERGE (t:Town {name: $name})
|
|
WITH t
|
|
MATCH (co:County) WHERE co.name = $county_name
|
|
MERGE (t)-[r:IS_IN_COUNTY]->(co)
|
|
RETURN t
|
|
"""
|
|
tx.run(town_query, name=town_name, county_name=county_name)
|
|
|
|
# Locality
|
|
locality_name = row.get('Locality', '').strip()
|
|
if locality_name and locality_name != 'Not applicable':
|
|
locality_query = """
|
|
MERGE (l:Locality {name: $name})
|
|
WITH l
|
|
MATCH (t:Town) WHERE t.name = $town_name
|
|
MERGE (l)-[r:IS_IN_TOWN]->(t)
|
|
RETURN l
|
|
"""
|
|
tx.run(locality_query, name=locality_name, town_name=town_name)
|
|
|
|
# Government Office Region
|
|
gor_name = row.get('GOR (name)', '').strip()
|
|
if gor_name and gor_name != 'Not applicable':
|
|
gor_query = """
|
|
MERGE (gor:GovernmentOfficeRegion {name: $name})
|
|
WITH gor
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:IS_IN_GOVERNMENT_OFFICE_REGION]->(gor)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(gor_query, name=gor_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# District Administrative
|
|
district_name = row.get('DistrictAdministrative (name)', '').strip()
|
|
if district_name and district_name != 'Not applicable':
|
|
district_query = """
|
|
MERGE (da:DistrictAdministrative {name: $name})
|
|
WITH da
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MERGE (e)-[r:IS_IN_DISTRICT_ADMINISTRATIVE]->(da)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(district_query, name=district_name, urn=establishment_node['urn'])
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
# Establishment location relationships
|
|
if locality_name and locality_name != 'Not applicable':
|
|
location_query = """
|
|
MATCH (e:Establishment) WHERE e.urn = $urn
|
|
MATCH (l:Locality) WHERE l.name = $locality_name
|
|
MERGE (e)-[r:IS_LOCATED_IN_LOCALITY]->(l)
|
|
RETURN r
|
|
"""
|
|
result = tx.run(location_query, urn=establishment_node['urn'], locality_name=locality_name)
|
|
if result.single():
|
|
relationships_created += 1
|
|
|
|
return relationships_created
|
|
|
|
def _process_headers(self, fieldnames: List[str]) -> None:
|
|
"""Process CSV headers to understand the data structure"""
|
|
logger.info(f"Processing {len(fieldnames)} columns from Edubase data")
|
|
|
|
# Group related columns
|
|
self.column_groups = {
|
|
'establishment': ['URN', 'EstablishmentNumber', 'EstablishmentName'],
|
|
'local_authority': ['LA (code)', 'LA (name)'],
|
|
'establishment_type': ['TypeOfEstablishment (code)', 'TypeOfEstablishment (name)'],
|
|
'establishment_type_group': ['EstablishmentTypeGroup (code)', 'EstablishmentTypeGroup (name)'],
|
|
'establishment_status': ['EstablishmentStatus (code)', 'EstablishmentStatus (name)'],
|
|
'phase_education': ['PhaseOfEducation (code)', 'PhaseOfEducation (name)', 'StatutoryLowAge', 'StatutoryHighAge'],
|
|
'boarders': ['Boarders (code)', 'Boarders (name)'],
|
|
'nursery': ['NurseryProvision (name)'],
|
|
'sixth_form': ['OfficialSixthForm (code)', 'OfficialSixthForm (name)'],
|
|
'gender': ['Gender (code)', 'Gender (name)'],
|
|
'religious': ['ReligiousCharacter (code)', 'ReligiousCharacter (name)', 'ReligiousEthos (name)'],
|
|
'diocese': ['Diocese (code)', 'Diocese (name)'],
|
|
'admissions': ['AdmissionsPolicy (code)', 'AdmissionsPolicy (name)'],
|
|
'capacity': ['SchoolCapacity'],
|
|
'special_classes': ['SpecialClasses (code)', 'SpecialClasses (name)'],
|
|
'census': ['CensusDate', 'NumberOfPupils', 'NumberOfBoys', 'NumberOfGirls', 'PercentageFSM'],
|
|
'trust': ['TrustSchoolFlag (code)', 'TrustSchoolFlag (name)', 'Trusts (code)', 'Trusts (name)'],
|
|
'sponsor': ['SchoolSponsorFlag (name)', 'SchoolSponsors (name)'],
|
|
'federation': ['FederationFlag (name)', 'Federations (code)', 'Federations (name)'],
|
|
'ukprn': ['UKPRN'],
|
|
'fe_type': ['FEHEIdentifier', 'FurtherEducationType (name)'],
|
|
'dates': ['OpenDate', 'CloseDate', 'LastChangedDate'],
|
|
'address': ['Street', 'Locality', 'Address3', 'Town', 'County (name)', 'Postcode'],
|
|
'contact': ['SchoolWebsite', 'TelephoneNum'],
|
|
'head_teacher': ['HeadTitle (name)', 'HeadFirstName', 'HeadLastName', 'HeadPreferredJobTitle'],
|
|
'inspection': ['BSOInspectorateName (name)', 'InspectorateReport', 'DateOfLastInspectionVisit', 'NextInspectionVisit'],
|
|
'special_provision': ['TeenMoth (name)', 'TeenMothPlaces', 'CCF (name)', 'SENPRU (name)', 'EBD (name)', 'PlacesPRU'],
|
|
'ft_provision': ['FTProv (name)', 'EdByOther (name)', 'Section41Approved (name)'],
|
|
'sen_provision': ['SEN1 (name)', 'SEN2 (name)', 'SEN3 (name)', 'SEN4 (name)', 'SEN5 (name)',
|
|
'SEN6 (name)', 'SEN7 (name)', 'SEN8 (name)', 'SEN9 (name)', 'SEN10 (name)',
|
|
'SEN11 (name)', 'SEN12 (name)', 'SEN13 (name)'],
|
|
'resourced_provision': ['TypeOfResourcedProvision (name)', 'ResourcedProvisionOnRoll', 'ResourcedProvisionCapacity'],
|
|
'sen_unit': ['SenUnitOnRoll', 'SenUnitCapacity'],
|
|
'geography': ['GOR (code)', 'GOR (name)', 'DistrictAdministrative (code)', 'DistrictAdministrative (name)',
|
|
'AdministrativeWard (code)', 'AdministrativeWard (name)', 'ParliamentaryConstituency (code)',
|
|
'ParliamentaryConstituency (name)', 'UrbanRural (code)', 'UrbanRural (name)'],
|
|
'gss_codes': ['GSSLACode (name)', 'Easting', 'Northing', 'MSOA (name)', 'LSOA (name)'],
|
|
'inspection_details': ['InspectorateName (name)', 'SENStat', 'SENNoStat'],
|
|
'boarding': ['BoardingEstablishment (name)'],
|
|
'props': ['PropsName'],
|
|
'previous': ['PreviousLA (code)', 'PreviousLA (name)', 'PreviousEstablishmentNumber'],
|
|
'country': ['Country (name)'],
|
|
'uprn': ['UPRN'],
|
|
'site': ['SiteName'],
|
|
'qab': ['QABName (code)', 'QABName (name)', 'EstablishmentAccredited (code)', 'EstablishmentAccredited (name)',
|
|
'QABReport', 'AccreditationExpiryDate'],
|
|
'ch_number': ['CHNumber'],
|
|
'msoa_lsoa_codes': ['MSOA (code)', 'LSOA (code)'],
|
|
'fsm': ['FSM']
|
|
}
|
|
|
|
def _process_edubase_row(self, row: Dict[str, str]) -> Tuple[List[Tuple[str, Dict[str, Any]]], List[Tuple[str, str, str]]]:
|
|
"""Process a single Edubase data row and return a tuple of (nodes, relationships)"""
|
|
nodes = []
|
|
relationships = []
|
|
|
|
try:
|
|
# Create main establishment node
|
|
establishment_props = self._extract_establishment_properties(row)
|
|
if establishment_props:
|
|
nodes.append(('Establishment', establishment_props))
|
|
|
|
# Create related nodes and relationships
|
|
self._create_related_nodes_and_relationships(row, nodes, relationships)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing row {row.get('URN', 'unknown')}: {str(e)}")
|
|
|
|
return nodes, relationships
|
|
|
|
def _extract_establishment_properties(self, row: Dict[str, str]) -> Optional[Dict[str, Any]]:
|
|
"""Extract properties for the main establishment node"""
|
|
urn = row.get('URN', '').strip()
|
|
if not urn:
|
|
return None
|
|
|
|
props = {
|
|
'urn': urn,
|
|
'establishmentNumber': row.get('EstablishmentNumber', '').strip(),
|
|
'establishmentName': row.get('EstablishmentName', '').strip(),
|
|
'openDate': self._parse_date(row.get('OpenDate', '')),
|
|
'closeDate': self._parse_date(row.get('CloseDate', '')),
|
|
'lastChangedDate': self._parse_date(row.get('LastChangedDate', '')),
|
|
'schoolCapacity': self._parse_int(row.get('SchoolCapacity', '')),
|
|
'numberOfPupils': self._parse_int(row.get('NumberOfPupils', '')),
|
|
'numberOfBoys': self._parse_int(row.get('NumberOfBoys', '')),
|
|
'numberOfGirls': self._parse_int(row.get('NumberOfGirls', '')),
|
|
'percentageFSM': self._parse_float(row.get('PercentageFSM', '')),
|
|
'statutoryLowAge': self._parse_int(row.get('StatutoryLowAge', '')),
|
|
'statutoryHighAge': self._parse_int(row.get('StatutoryHighAge', '')),
|
|
'easting': self._parse_int(row.get('Easting', '')),
|
|
'northing': self._parse_int(row.get('Northing', '')),
|
|
'street': row.get('Street', '').strip(),
|
|
'locality': row.get('Locality', '').strip(),
|
|
'address3': row.get('Address3', '').strip(),
|
|
'town': row.get('Town', '').strip(),
|
|
'county': row.get('County (name)', '').strip(),
|
|
'postcode': row.get('Postcode', '').strip(),
|
|
'schoolWebsite': row.get('SchoolWebsite', '').strip(),
|
|
'telephoneNum': row.get('TelephoneNum', '').strip(),
|
|
'headTitle': row.get('HeadTitle (name)', '').strip(),
|
|
'headFirstName': row.get('HeadFirstName', '').strip(),
|
|
'headLastName': row.get('HeadLastName', '').strip(),
|
|
'headPreferredJobTitle': row.get('HeadPreferredJobTitle', '').strip(),
|
|
'censusDate': self._parse_date(row.get('CensusDate', '')),
|
|
'teenMothPlaces': self._parse_int(row.get('TeenMothPlaces', '')),
|
|
'placesPRU': self._parse_int(row.get('PlacesPRU', '')),
|
|
'resourcedProvisionOnRoll': self._parse_int(row.get('ResourcedProvisionOnRoll', '')),
|
|
'resourcedProvisionCapacity': self._parse_int(row.get('ResourcedProvisionCapacity', '')),
|
|
'senUnitOnRoll': self._parse_int(row.get('SenUnitOnRoll', '')),
|
|
'senUnitCapacity': self._parse_int(row.get('SenUnitCapacity', '')),
|
|
'fsm': self._parse_int(row.get('FSM', '')),
|
|
'ukprn': row.get('UKPRN', '').strip(),
|
|
'uprn': row.get('UPRN', '').strip(),
|
|
'chNumber': row.get('CHNumber', '').strip()
|
|
}
|
|
|
|
# Remove empty/None values
|
|
props = {k: v for k, v in props.items() if v is not None and v != '' and v != 'Not applicable'}
|
|
|
|
return props
|
|
|
|
def _create_related_nodes_and_relationships(self, row: Dict[str, str], nodes: List[Tuple[str, Dict[str, Any]]], relationships: List[Tuple[str, str, str]]) -> None:
|
|
"""Create related nodes and relationships for an establishment"""
|
|
urn = row.get('URN', '').strip()
|
|
if not urn:
|
|
return
|
|
|
|
# Local Authority
|
|
la_code = row.get('LA (code)', '').strip()
|
|
la_name = row.get('LA (name)', '').strip()
|
|
if la_code and la_name and la_name != 'Not applicable':
|
|
la_key = f"{la_code}_{la_name}"
|
|
if la_key not in self.created_nodes['LocalAuthority']:
|
|
nodes.append(('LocalAuthority', {'code': la_code, 'name': la_name}))
|
|
self.created_nodes['LocalAuthority'].add(la_key)
|
|
relationships.append(('IS_CONTROLLED_BY_LOCAL_AUTHORITY', urn, la_key))
|
|
|
|
# Establishment Type
|
|
type_code = row.get('TypeOfEstablishment (code)', '').strip()
|
|
type_name = row.get('TypeOfEstablishment (name)', '').strip()
|
|
if type_code and type_name and type_name != 'Not applicable':
|
|
type_key = f"{type_code}_{type_name}"
|
|
if type_key not in self.created_nodes['EstablishmentType']:
|
|
nodes.append(('EstablishmentType', {'code': type_code, 'name': type_name}))
|
|
self.created_nodes['EstablishmentType'].add(type_key)
|
|
relationships.append(('IS_ESTABLISHMENT_TYPE', urn, type_key))
|
|
|
|
# Establishment Type Group
|
|
group_code = row.get('EstablishmentTypeGroup (code)', '').strip()
|
|
group_name = row.get('EstablishmentTypeGroup (name)', '').strip()
|
|
if group_code and group_name and group_name != 'Not applicable':
|
|
group_key = f"{group_code}_{group_name}"
|
|
if group_key not in self.created_nodes['EstablishmentTypeGroup']:
|
|
nodes.append(('EstablishmentTypeGroup', {'code': group_code, 'name': group_name}))
|
|
self.created_nodes['EstablishmentTypeGroup'].add(group_key)
|
|
relationships.append(('IS_ESTABLISHMENT_TYPE_GROUP', urn, group_key))
|
|
|
|
# Establishment Status
|
|
status_code = row.get('EstablishmentStatus (code)', '').strip()
|
|
status_name = row.get('EstablishmentStatus (name)', '').strip()
|
|
if status_code and status_name and status_name != 'Not applicable':
|
|
status_key = f"{status_code}_{status_name}"
|
|
if status_key not in self.created_nodes['EstablishmentStatus']:
|
|
nodes.append(('EstablishmentStatus', {'code': status_code, 'name': status_name}))
|
|
self.created_nodes['EstablishmentStatus'].add(status_key)
|
|
relationships.append(('CURRENT_ESTABLISHMENT_STATUS', urn, status_key))
|
|
|
|
# Phase of Education
|
|
phase_code = row.get('PhaseOfEducation (code)', '').strip()
|
|
phase_name = row.get('PhaseOfEducation (name)', '').strip()
|
|
if phase_code and phase_name and phase_name != 'Not applicable':
|
|
phase_key = f"{phase_code}_{phase_name}"
|
|
if phase_key not in self.created_nodes['PhaseOfEducation']:
|
|
nodes.append(('PhaseOfEducation', {'code': phase_code, 'name': phase_name}))
|
|
self.created_nodes['PhaseOfEducation'].add(phase_key)
|
|
relationships.append(('PROVIDES_PHASE_OF_EDUCATION', urn, phase_key))
|
|
|
|
# Gender
|
|
gender_code = row.get('Gender (code)', '').strip()
|
|
gender_name = row.get('Gender (name)', '').strip()
|
|
if gender_code and gender_name and gender_name != 'Not applicable':
|
|
gender_key = f"{gender_code}_{gender_name}"
|
|
if gender_key not in self.created_nodes['GenderType']:
|
|
nodes.append(('GenderType', {'code': gender_code, 'name': gender_name}))
|
|
self.created_nodes['GenderType'].add(gender_key)
|
|
relationships.append(('PROVIDES_FOR_GENDER_TYPE', urn, gender_key))
|
|
|
|
# Religious Character
|
|
rel_code = row.get('ReligiousCharacter (code)', '').strip()
|
|
rel_name = row.get('ReligiousCharacter (name)', '').strip()
|
|
if rel_code and rel_name and rel_name != 'Not applicable':
|
|
rel_key = f"{rel_code}_{rel_name}"
|
|
if rel_key not in self.created_nodes['ReligiousCharacter']:
|
|
nodes.append(('ReligiousCharacter', {'code': rel_code, 'name': rel_name}))
|
|
self.created_nodes['ReligiousCharacter'].add(rel_key)
|
|
relationships.append(('INCLUDES_RELIGIOUS_CHARACTER', urn, rel_key))
|
|
|
|
# Diocese
|
|
diocese_code = row.get('Diocese (code)', '').strip()
|
|
diocese_name = row.get('Diocese (name)', '').strip()
|
|
if diocese_code and diocese_name and diocese_name != 'Not applicable':
|
|
diocese_key = f"{diocese_code}_{diocese_name}"
|
|
if diocese_key not in self.created_nodes['Diocese']:
|
|
nodes.append(('Diocese', {'code': diocese_code, 'name': diocese_name}))
|
|
self.created_nodes['Diocese'].add(diocese_key)
|
|
relationships.append(('UNDER_DIOCESE', urn, diocese_key))
|
|
|
|
# Government Office Region
|
|
gor_code = row.get('GOR (code)', '').strip()
|
|
gor_name = row.get('GOR (name)', '').strip()
|
|
if gor_code and gor_name and gor_name != 'Not applicable':
|
|
gor_key = f"{gor_code}_{gor_name}"
|
|
if gor_key not in self.created_nodes['GovernmentOfficeRegion']:
|
|
nodes.append(('GovernmentOfficeRegion', {'code': gor_code, 'name': gor_name}))
|
|
self.created_nodes['GovernmentOfficeRegion'].add(gor_key)
|
|
relationships.append(('OVERSEEN_BY_GOVERNMENT_OFFICE_REGION', urn, gor_key))
|
|
|
|
# District Administrative
|
|
district_code = row.get('DistrictAdministrative (code)', '').strip()
|
|
district_name = row.get('DistrictAdministrative (name)', '').strip()
|
|
if district_code and district_name and district_name != 'Not applicable':
|
|
district_key = f"{district_code}_{district_name}"
|
|
if district_key not in self.created_nodes['DistrictAdministrative']:
|
|
nodes.append(('DistrictAdministrative', {'code': district_code, 'name': district_name}))
|
|
self.created_nodes['DistrictAdministrative'].add(district_key)
|
|
relationships.append(('WITHIN_DISTRICT_ADMINISTRATIVE', urn, district_key))
|
|
|
|
# Country
|
|
country_name = row.get('Country (name)', '').strip()
|
|
if country_name and country_name != 'Not applicable':
|
|
if country_name not in self.created_nodes['Country']:
|
|
nodes.append(('Country', {'name': country_name}))
|
|
self.created_nodes['Country'].add(country_name)
|
|
relationships.append(('LOCATED_IN_COUNTRY', urn, country_name))
|
|
|
|
# County
|
|
county_name = row.get('County (name)', '').strip()
|
|
if county_name and county_name != 'Not applicable':
|
|
if county_name not in self.created_nodes['County']:
|
|
nodes.append(('County', {'name': county_name}))
|
|
self.created_nodes['County'].add(county_name)
|
|
relationships.append(('LOCATED_IN_COUNTY', urn, county_name))
|
|
|
|
# County is in Country
|
|
if country_name and country_name != 'Not applicable':
|
|
relationships.append(('PART_OF_COUNTRY', county_name, country_name))
|
|
|
|
# Town
|
|
town_name = row.get('Town', '').strip()
|
|
if town_name and town_name != 'Not applicable':
|
|
if town_name not in self.created_nodes['Town']:
|
|
nodes.append(('Town', {'name': town_name}))
|
|
self.created_nodes['Town'].add(town_name)
|
|
relationships.append(('LOCATED_IN_TOWN', urn, town_name))
|
|
|
|
# Town is in County
|
|
if county_name and county_name != 'Not applicable':
|
|
relationships.append(('PART_OF_COUNTY', town_name, county_name))
|
|
|
|
# Locality
|
|
locality_name = row.get('Locality', '').strip()
|
|
if locality_name and locality_name != 'Not applicable':
|
|
if locality_name not in self.created_nodes['Locality']:
|
|
nodes.append(('Locality', {'name': locality_name}))
|
|
self.created_nodes['Locality'].add(locality_name)
|
|
relationships.append(('LOCATED_IN_LOCALITY', urn, locality_name))
|
|
|
|
# Locality is in Town
|
|
if town_name and town_name != 'Not applicable':
|
|
relationships.append(('PART_OF_TOWN', locality_name, town_name))
|
|
|
|
# Special Classes
|
|
special_classes_code = row.get('SpecialClasses (code)', '').strip()
|
|
special_classes_name = row.get('SpecialClasses (name)', '').strip()
|
|
if special_classes_code and special_classes_name and special_classes_name != 'Not applicable':
|
|
special_classes_key = f"{special_classes_code}_{special_classes_name}"
|
|
if special_classes_key not in self.created_nodes['SpecialClasses']:
|
|
nodes.append(('SpecialClasses', {'code': special_classes_code, 'name': special_classes_name}))
|
|
self.created_nodes['SpecialClasses'].add(special_classes_key)
|
|
relationships.append(('PROVIDES_SPECIAL_CLASSES', urn, special_classes_key))
|
|
|
|
# Further Education Type
|
|
fe_type_name = row.get('FurtherEducationType (name)', '').strip()
|
|
if fe_type_name and fe_type_name != 'Not applicable':
|
|
if fe_type_name not in self.created_nodes['FurtherEducationType']:
|
|
nodes.append(('FurtherEducationType', {'name': fe_type_name}))
|
|
self.created_nodes['FurtherEducationType'].add(fe_type_name)
|
|
relationships.append(('PROVIDES_FURTHER_EDUCATION_TYPE', urn, fe_type_name))
|
|
|
|
# Sixth Form
|
|
sixth_form_code = row.get('OfficialSixthForm (code)', '').strip()
|
|
sixth_form_name = row.get('OfficialSixthForm (name)', '').strip()
|
|
if sixth_form_code and sixth_form_name and sixth_form_name != 'Not applicable':
|
|
sixth_form_key = f"{sixth_form_code}_{sixth_form_name}"
|
|
if sixth_form_key not in self.created_nodes['SixthForm']:
|
|
nodes.append(('SixthForm', {'code': sixth_form_code, 'name': sixth_form_name}))
|
|
self.created_nodes['SixthForm'].add(sixth_form_key)
|
|
relationships.append(('PROVIDES_SIXTH_FORM', urn, sixth_form_key))
|
|
|
|
def _create_all_nodes(self, all_nodes: List[Tuple[str, Dict[str, Any]]]) -> Dict[str, Any]:
|
|
"""Create all nodes from the collected list of (label, properties) tuples"""
|
|
node_map = {}
|
|
try:
|
|
with self.neo4j_service.driver.session(database=self.db_name) as session:
|
|
for label, properties in all_nodes:
|
|
try:
|
|
if label == 'Establishment':
|
|
# For establishments, use URN as key
|
|
key = properties.get('urn')
|
|
if key:
|
|
node = create_node_direct(session, label, properties)
|
|
if node:
|
|
node_map[key] = node
|
|
logger.debug(f"Created {label} node with key: {key}")
|
|
else:
|
|
# For other nodes, create and store for later relationship creation
|
|
node = create_node_direct(session, label, properties)
|
|
if node:
|
|
# Create a key for this node
|
|
if 'code' in properties and 'name' in properties:
|
|
key = f"{properties['code']}_{properties['name']}"
|
|
elif 'name' in properties:
|
|
key = properties['name']
|
|
else:
|
|
key = str(node.id)
|
|
node_map[key] = node
|
|
logger.debug(f"Created {label} node with key: {key}")
|
|
except Exception as e:
|
|
logger.error(f"Failed to create {label} node: {str(e)}")
|
|
return node_map
|
|
except Exception as e:
|
|
logger.error(f"Error creating all nodes: {str(e)}")
|
|
return {}
|
|
|
|
def _create_all_relationships(self, all_relationships: List[Tuple[str, str, str]], node_map: Dict[str, Any]) -> int:
|
|
"""Create all relationships from the collected list of (rel_type, start_key, end_key) tuples"""
|
|
relationships_created = 0
|
|
try:
|
|
with self.neo4j_service.driver.session(database=self.db_name) as session:
|
|
for rel_type, start_key, end_key in all_relationships:
|
|
start_node = node_map.get(start_key)
|
|
end_node = node_map.get(end_key)
|
|
|
|
if start_node and end_node:
|
|
try:
|
|
# Use property-based matching instead of deprecated ID() function
|
|
# This is the recommended approach for Neo4j 5.x
|
|
|
|
# Get properties from the nodes
|
|
start_props = dict(start_node)
|
|
end_props = dict(end_node)
|
|
|
|
# Get labels
|
|
start_label = list(start_node.labels)[0] if start_node.labels else 'Node'
|
|
end_label = list(end_node.labels)[0] if end_node.labels else 'Node'
|
|
|
|
# Create a unique property-based query
|
|
# For establishments, use URN (unique identifier)
|
|
if start_label == 'Establishment' and 'urn' in start_props:
|
|
start_match = f"n1:{start_label} {{urn: $start_urn}}"
|
|
start_params = {'start_urn': start_props['urn']}
|
|
else:
|
|
# For other nodes, use the combined code_name key format
|
|
# This matches how we create the nodes in the first place
|
|
if 'code' in start_props and 'name' in start_props:
|
|
start_match = f"n1:{start_label} {{code: $start_code, name: $start_name}}"
|
|
start_params = {'start_code': start_props['code'], 'start_name': start_props['name']}
|
|
elif 'name' in start_props:
|
|
start_match = f"n1:{start_label} {{name: $start_name}}"
|
|
start_params = {'start_name': start_props['name']}
|
|
else:
|
|
print(f"ERROR: No unique property found for start node {start_label}: {start_props}")
|
|
continue
|
|
|
|
# Same for end node
|
|
if end_label == 'Establishment' and 'urn' in end_props:
|
|
end_match = f"n2:{end_label} {{urn: $end_urn}}"
|
|
end_params = {'end_urn': end_props['urn']}
|
|
else:
|
|
# For other nodes, use the combined code_name key format
|
|
if 'code' in end_props and 'name' in end_props:
|
|
end_match = f"n2:{end_label} {{code: $end_code, name: $end_name}}"
|
|
end_params = {'end_code': end_props['code'], 'end_name': end_props['name']}
|
|
elif 'name' in end_props:
|
|
end_match = f"n2:{end_label} {{name: $end_name}}"
|
|
end_params = {'end_name': end_props['name']}
|
|
else:
|
|
print(f"ERROR: No unique property found for end node {end_label}: {end_props}")
|
|
continue
|
|
|
|
# Combine parameters
|
|
params = {**start_params, **end_params}
|
|
|
|
# Create relationship using property-based matching
|
|
query = f"""
|
|
MATCH ({start_match}), ({end_match})
|
|
MERGE (n1)-[r:{rel_type}]->(n2)
|
|
RETURN r
|
|
"""
|
|
|
|
result = session.run(query, **params)
|
|
record = result.single()
|
|
|
|
if record and record["r"]:
|
|
self.created_relationships.add(f"{start_key}-{rel_type}-{end_key}")
|
|
relationships_created += 1
|
|
print(f"SUCCESS: Created relationship {rel_type} between {start_key} and {end_key}")
|
|
else:
|
|
print(f"FAILED: Could not create relationship {rel_type} between {start_key} and {end_key}")
|
|
|
|
except Exception as e:
|
|
print(f"ERROR: Exception creating relationship {rel_type} between {start_key} and {end_key}: {str(e)}")
|
|
else:
|
|
if not start_node:
|
|
logger.warning(f"Start node not found for relationship {rel_type} with key {start_key}")
|
|
if not end_node:
|
|
logger.warning(f"End node not found for relationship {rel_type} with key {end_key}")
|
|
return relationships_created
|
|
except Exception as e:
|
|
logger.error(f"Error creating all relationships: {str(e)}")
|
|
return 0
|
|
|
|
def _parse_date(self, date_str: str) -> Optional[str]:
|
|
"""Parse date string to ISO format"""
|
|
if not date_str or date_str.strip() == '' or date_str.strip() == 'Not applicable':
|
|
return None
|
|
|
|
try:
|
|
# Handle DD-MM-YYYY format
|
|
if '-' in date_str:
|
|
parts = date_str.split('-')
|
|
if len(parts) == 3:
|
|
day, month, year = parts
|
|
if len(year) == 2:
|
|
year = f"20{year}" if int(year) < 50 else f"19{year}"
|
|
return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
|
|
except:
|
|
pass
|
|
|
|
return date_str.strip()
|
|
|
|
def _parse_int(self, int_str: str) -> Optional[int]:
|
|
"""Parse integer string"""
|
|
if not int_str or int_str.strip() == '' or int_str.strip() == 'Not applicable':
|
|
return None
|
|
|
|
try:
|
|
return int(int_str.strip())
|
|
except:
|
|
return None
|
|
|
|
def _parse_float(self, float_str: str) -> Optional[float]:
|
|
"""Parse float string"""
|
|
if not float_str or float_str.strip() == '' or float_str.strip() == 'Not applicable':
|
|
return None
|
|
|
|
try:
|
|
return float(float_str.strip())
|
|
except:
|
|
return None
|
|
|
|
def import_gais_data() -> Dict[str, Any]:
    """Import GAIS (Edubase) data into the Neo4j database.

    Runs the importer over all rows in the CSV (test_mode disabled).

    Returns:
        The importer's result dict (must contain a 'success' key), or a
        {'success': False, 'message': ...} dict on unexpected errors.
    """
    logger.info("Starting GAIS data import...")

    try:
        importer = GAISDataImporter()
        # Process all rows in the CSV file
        result = importer.import_edubase_data_simple(test_mode=False)

        if result["success"]:
            logger.info("GAIS data import completed successfully!")
        else:
            # .get guards against failure results that omit a 'message' key,
            # which would otherwise raise KeyError while logging.
            logger.error(f"GAIS data import failed: {result.get('message', 'unknown error')}")

        return result

    except Exception as e:
        logger.error(f"Error in GAIS data import: {str(e)}")
        return {
            "success": False,
            "message": f"Error in GAIS data import: {str(e)}"
        }