From c69451fba20c3d3915d38d227ad7bcb7d6a55638 Mon Sep 17 00:00:00 2001 From: kcar Date: Mon, 8 Jun 2026 02:15:05 +0100 Subject: [PATCH] [verified] add upload size and MIME guards (cherry picked from commit f5e05376f637f55b73e474cac8199529682ca398) --- modules/upload_validation.py | 99 ++++++++++++++++++++++ routers/database/files/files.py | 7 +- routers/database/files/files_simplified.py | 6 +- routers/exam/templates.py | 7 +- routers/simple_upload.py | 13 +-- tests/test_upload_validation.py | 54 ++++++++++++ 6 files changed, 169 insertions(+), 17 deletions(-) create mode 100644 modules/upload_validation.py create mode 100644 tests/test_upload_validation.py diff --git a/modules/upload_validation.py b/modules/upload_validation.py new file mode 100644 index 0000000..bd95de7 --- /dev/null +++ b/modules/upload_validation.py @@ -0,0 +1,99 @@ +"""Upload boundary validation shared by file-upload endpoints. + +E3 hardening: keep user-facing upload routes from buffering arbitrary data and +from accepting arbitrary MIME/types into Supabase storage. +""" +from __future__ import annotations + +import os +from typing import Iterable, Optional + +from fastapi import HTTPException, UploadFile + +# Conservative defaults: Classroom Copilot uploads are user documents/images. +# Exam scan uploads already have their own 50 MB PDF-only guard in routers.exam.batches. +MAX_UPLOAD_BYTES = int(os.getenv("CC_UPLOAD_MAX_BYTES", str(25 * 1024 * 1024))) +UPLOAD_CHUNK_BYTES = 1024 * 1024 + +ALLOWED_UPLOAD_MIME_TYPES = frozenset( + mt.strip().lower() + for mt in os.getenv( + "CC_UPLOAD_ALLOWED_MIME_TYPES", + ",".join( + [ + "application/pdf", + "image/png", + "image/jpeg", + "image/webp", + "image/gif", + "text/plain", + "text/csv", + "text/markdown", + "application/msword", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.ms-powerpoint", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ] + ), + ).split(",") + if mt.strip() +) + +_PDF_MIME_TYPES = {"application/pdf", "application/x-pdf"} + + +def allowed_upload_mime_types_csv() -> str: + """Stable display string for evidence/errors without leaking config internals.""" + return ", ".join(sorted(ALLOWED_UPLOAD_MIME_TYPES)) + + +def _declared_mime(upload: UploadFile) -> str: + return (upload.content_type or "application/octet-stream").split(";", 1)[0].strip().lower() + + +def validate_upload_mime(upload: UploadFile, *, allowed_mime_types: Optional[Iterable[str]] = None) -> str: + """Validate client-declared upload MIME/type and return its normalised value.""" + declared = _declared_mime(upload) + allowed = {mt.lower() for mt in (allowed_mime_types or ALLOWED_UPLOAD_MIME_TYPES)} + if declared not in allowed: + raise HTTPException( + status_code=415, + detail=( + f"Unsupported upload type '{declared}'. Allowed MIME types: " + f"{', '.join(sorted(allowed))}" + ), + ) + return declared + + +async def read_upload_bytes( + upload: UploadFile, + *, + max_bytes: int = MAX_UPLOAD_BYTES, + allowed_mime_types: Optional[Iterable[str]] = None, +) -> tuple[bytes, str]: + """Validate MIME and read an UploadFile with a hard size ceiling.""" + mime_type = validate_upload_mime(upload, allowed_mime_types=allowed_mime_types) + chunks: list[bytes] = [] + total = 0 + while True: + chunk = await upload.read(UPLOAD_CHUNK_BYTES) + if not chunk: + break + total += len(chunk) + if total > max_bytes: + raise HTTPException(status_code=413, detail=f"Upload exceeds max size ({max_bytes} bytes)") + chunks.append(chunk) + return b"".join(chunks), mime_type + + +async def read_pdf_upload_bytes(upload: UploadFile, *, max_bytes: int = MAX_UPLOAD_BYTES) -> bytes: + """Read a PDF-only upload with size and lightweight magic-header validation.""" + data, _mime_type = await read_upload_bytes(upload, max_bytes=max_bytes, allowed_mime_types=_PDF_MIME_TYPES) + if not data: + raise HTTPException(status_code=400, detail="Uploaded PDF is empty") + if not data.startswith(b"%PDF-"): + raise HTTPException(status_code=415, detail="Uploaded file is not a valid PDF") + return data diff --git a/routers/database/files/files.py b/routers/database/files/files.py index 6d81b46..4cf37d5 100644 --- a/routers/database/files/files.py +++ b/routers/database/files/files.py @@ -12,6 +12,7 @@ from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str from modules.logger_tool import initialise_logger from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin +from modules.upload_validation import read_upload_bytes from modules.document_processor import DocumentProcessor from modules.queue_system import ( enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task, @@ -88,13 +89,13 @@ async def upload_file( # Stage DB row to get file_id staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}" name = _safe_filename(path or file.filename) - file_bytes = await file.read() + file_bytes, mime_type = await read_upload_bytes(file) insert_res = client.supabase.table('files').insert({ 'cabinet_id': cabinet_id, 'name': name, 'path': staged_path, 'bucket': bucket, - 'mime_type': file.content_type, + 'mime_type': mime_type, 'uploaded_by': user_id, 'size_bytes': len(file_bytes), 'source': 'classroomcopilot-web' @@ -107,7 +108,7 @@ async def upload_file( # Final storage path: bucket/cabinet_id/file_id/file final_storage_path = f"{cabinet_id}/{file_id}/{name}" try: - storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True) + storage.upload_file(bucket, final_storage_path, file_bytes, mime_type, upsert=True) except Exception as e: # cleanup staged row client.supabase.table('files').delete().eq('id', file_id).execute() diff --git a/routers/database/files/files_simplified.py b/routers/database/files/files_simplified.py index 2dd5397..d27afca 100644 --- a/routers/database/files/files_simplified.py +++ b/routers/database/files/files_simplified.py @@ -19,6 +19,7 @@ from fastapi.responses import JSONResponse from modules.auth.supabase_bearer import SupabaseBearer from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin +from modules.upload_validation import read_upload_bytes from modules.logger_tool import initialise_logger router = APIRouter() @@ -72,10 +73,9 @@ async def upload_file( if not user_id: raise HTTPException(status_code=401, detail="User ID required") - # Read file content - file_bytes = await file.read() + # Validate MIME/type and read file content with a hard size limit. + file_bytes, mime_type = await read_upload_bytes(file) file_size = len(file_bytes) - mime_type = file.content_type or 'application/octet-stream' filename = file.filename or path logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}") diff --git a/routers/exam/templates.py b/routers/exam/templates.py index 4bb2171..5b19af6 100644 --- a/routers/exam/templates.py +++ b/routers/exam/templates.py @@ -28,6 +28,7 @@ from api.services.docling.regions import detect_response_regions_from_pdf from modules.database.services.exam_projection import project_template, project_template_safe from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin +from modules.upload_validation import read_pdf_upload_bytes from modules.logger_tool import initialise_logger from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code from routers.exam.schemas import ( @@ -164,11 +165,7 @@ async def _upload_template_source_file( institute_id: str, upload: UploadFile, ) -> str: - file_bytes = await upload.read() - if not file_bytes: - raise HTTPException(status_code=400, detail="Uploaded PDF is empty") - if upload.content_type and upload.content_type != "application/pdf": - raise HTTPException(status_code=400, detail="Uploaded file must be a PDF") + file_bytes = await read_pdf_upload_bytes(upload) service = SupabaseServiceRoleClient() storage = StorageAdmin() diff --git a/routers/simple_upload.py b/routers/simple_upload.py index 44e19bd..fbe4181 100644 --- a/routers/simple_upload.py +++ b/routers/simple_upload.py @@ -26,6 +26,7 @@ from fastapi.responses import JSONResponse from modules.auth.supabase_bearer import SupabaseBearer from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin +from modules.upload_validation import read_upload_bytes from modules.logger_tool import initialise_logger router = APIRouter() @@ -59,10 +60,9 @@ async def upload_single_file( if not user_id: raise HTTPException(status_code=401, detail="User ID required") - # Read file content - file_bytes = await file.read() + # Validate MIME/type and read file content with a hard size limit. + file_bytes, mime_type = await read_upload_bytes(file) file_size = len(file_bytes) - mime_type = file.content_type or 'application/octet-stream' filename = file.filename or path logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}") @@ -234,10 +234,9 @@ async def upload_directory( # Process each file for i, (file, relative_path) in enumerate(zip(files, relative_paths)): try: - # Read file content - file_bytes = await file.read() + # Validate MIME/type and read file content with a hard size limit. + file_bytes, mime_type = await read_upload_bytes(file) file_size = len(file_bytes) - mime_type = file.content_type or 'application/octet-stream' filename = file.filename or f"file_{i}" total_size += file_size @@ -291,6 +290,8 @@ async def upload_directory( logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}") + except HTTPException: + raise except Exception as e: logger.error(f"Failed to upload file {relative_path}: {e}") # Continue with other files, don't fail entire upload diff --git a/tests/test_upload_validation.py b/tests/test_upload_validation.py new file mode 100644 index 0000000..5cd2f00 --- /dev/null +++ b/tests/test_upload_validation.py @@ -0,0 +1,54 @@ +import asyncio + +import pytest +from fastapi import HTTPException + +from modules.upload_validation import MAX_UPLOAD_BYTES, read_pdf_upload_bytes, read_upload_bytes + + +class FakeUpload: + def __init__(self, data: bytes, content_type: str, filename: str = "file.bin"): + self._data = data + self._pos = 0 + self.content_type = content_type + self.filename = filename + + async def read(self, size: int = -1) -> bytes: + if self._pos >= len(self._data): + return b"" + if size is None or size < 0: + size = len(self._data) - self._pos + chunk = self._data[self._pos : self._pos + size] + self._pos += len(chunk) + return chunk + + +def run(coro): + return asyncio.run(coro) + + +def test_valid_pdf_upload_passes_and_returns_mime(): + data, mime = run(read_upload_bytes(FakeUpload(b"%PDF-1.7\n", "application/pdf"))) + assert data.startswith(b"%PDF-") + assert mime == "application/pdf" + + +def test_disallowed_mime_rejected_with_415(): + with pytest.raises(HTTPException) as exc: + run(read_upload_bytes(FakeUpload(b"print(1)", "application/x-python"))) + assert exc.value.status_code == 415 + assert "Unsupported upload type" in exc.value.detail + + +def test_oversize_upload_rejected_with_413(): + with pytest.raises(HTTPException) as exc: + run(read_upload_bytes(FakeUpload(b"x" * (MAX_UPLOAD_BYTES + 1), "text/plain"))) + assert exc.value.status_code == 413 + assert "exceeds max size" in exc.value.detail + + +def test_pdf_helper_rejects_spoofed_pdf_mime(): + with pytest.raises(HTTPException) as exc: + run(read_pdf_upload_bytes(FakeUpload(b"not a pdf", "application/pdf"))) + assert exc.value.status_code == 415 + assert "not a valid PDF" in exc.value.detail