[verified] add upload size and MIME guards
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled

(cherry picked from commit f5e05376f637f55b73e474cac8199529682ca398)
This commit is contained in:
kcar 2026-06-08 02:15:05 +01:00 committed by CC Worker
parent e98fed661f
commit c69451fba2
6 changed files with 169 additions and 17 deletions

View File

@ -0,0 +1,99 @@
"""Upload boundary validation shared by file-upload endpoints.
E3 hardening: keep user-facing upload routes from buffering arbitrary data and
from accepting arbitrary MIME/types into Supabase storage.
"""
from __future__ import annotations
import os
from typing import Iterable, Optional
from fastapi import HTTPException, UploadFile
# Conservative defaults: Classroom Copilot uploads are user documents/images.
# Exam scan uploads already have their own 50 MB PDF-only guard in routers.exam.batches.
MAX_UPLOAD_BYTES = int(os.getenv("CC_UPLOAD_MAX_BYTES", str(25 * 1024 * 1024)))
UPLOAD_CHUNK_BYTES = 1024 * 1024
ALLOWED_UPLOAD_MIME_TYPES = frozenset(
mt.strip().lower()
for mt in os.getenv(
"CC_UPLOAD_ALLOWED_MIME_TYPES",
",".join(
[
"application/pdf",
"image/png",
"image/jpeg",
"image/webp",
"image/gif",
"text/plain",
"text/csv",
"text/markdown",
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
),
).split(",")
if mt.strip()
)
_PDF_MIME_TYPES = {"application/pdf", "application/x-pdf"}
def allowed_upload_mime_types_csv() -> str:
"""Stable display string for evidence/errors without leaking config internals."""
return ", ".join(sorted(ALLOWED_UPLOAD_MIME_TYPES))
def _declared_mime(upload: UploadFile) -> str:
return (upload.content_type or "application/octet-stream").split(";", 1)[0].strip().lower()
def validate_upload_mime(upload: UploadFile, *, allowed_mime_types: Optional[Iterable[str]] = None) -> str:
"""Validate client-declared upload MIME/type and return its normalised value."""
declared = _declared_mime(upload)
allowed = {mt.lower() for mt in (allowed_mime_types or ALLOWED_UPLOAD_MIME_TYPES)}
if declared not in allowed:
raise HTTPException(
status_code=415,
detail=(
f"Unsupported upload type '{declared}'. Allowed MIME types: "
f"{', '.join(sorted(allowed))}"
),
)
return declared
async def read_upload_bytes(
upload: UploadFile,
*,
max_bytes: int = MAX_UPLOAD_BYTES,
allowed_mime_types: Optional[Iterable[str]] = None,
) -> tuple[bytes, str]:
"""Validate MIME and read an UploadFile with a hard size ceiling."""
mime_type = validate_upload_mime(upload, allowed_mime_types=allowed_mime_types)
chunks: list[bytes] = []
total = 0
while True:
chunk = await upload.read(UPLOAD_CHUNK_BYTES)
if not chunk:
break
total += len(chunk)
if total > max_bytes:
raise HTTPException(status_code=413, detail=f"Upload exceeds max size ({max_bytes} bytes)")
chunks.append(chunk)
return b"".join(chunks), mime_type
async def read_pdf_upload_bytes(upload: UploadFile, *, max_bytes: int = MAX_UPLOAD_BYTES) -> bytes:
"""Read a PDF-only upload with size and lightweight magic-header validation."""
data, _mime_type = await read_upload_bytes(upload, max_bytes=max_bytes, allowed_mime_types=_PDF_MIME_TYPES)
if not data:
raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
if not data.startswith(b"%PDF-"):
raise HTTPException(status_code=415, detail="Uploaded file is not a valid PDF")
return data

View File

@ -12,6 +12,7 @@ from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
from modules.logger_tool import initialise_logger
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.upload_validation import read_upload_bytes
from modules.document_processor import DocumentProcessor
from modules.queue_system import (
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
@ -88,13 +89,13 @@ async def upload_file(
# Stage DB row to get file_id
staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
name = _safe_filename(path or file.filename)
file_bytes = await file.read()
file_bytes, mime_type = await read_upload_bytes(file)
insert_res = client.supabase.table('files').insert({
'cabinet_id': cabinet_id,
'name': name,
'path': staged_path,
'bucket': bucket,
'mime_type': file.content_type,
'mime_type': mime_type,
'uploaded_by': user_id,
'size_bytes': len(file_bytes),
'source': 'classroomcopilot-web'
@ -107,7 +108,7 @@ async def upload_file(
# Final storage path: bucket/cabinet_id/file_id/file
final_storage_path = f"{cabinet_id}/{file_id}/{name}"
try:
storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
storage.upload_file(bucket, final_storage_path, file_bytes, mime_type, upsert=True)
except Exception as e:
# cleanup staged row
client.supabase.table('files').delete().eq('id', file_id).execute()

View File

@ -19,6 +19,7 @@ from fastapi.responses import JSONResponse
from modules.auth.supabase_bearer import SupabaseBearer
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.upload_validation import read_upload_bytes
from modules.logger_tool import initialise_logger
router = APIRouter()
@ -72,10 +73,9 @@ async def upload_file(
if not user_id:
raise HTTPException(status_code=401, detail="User ID required")
# Read file content
file_bytes = await file.read()
# Validate MIME/type and read file content with a hard size limit.
file_bytes, mime_type = await read_upload_bytes(file)
file_size = len(file_bytes)
mime_type = file.content_type or 'application/octet-stream'
filename = file.filename or path
logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}")

View File

@ -28,6 +28,7 @@ from api.services.docling.regions import detect_response_regions_from_pdf
from modules.database.services.exam_projection import project_template, project_template_safe
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.upload_validation import read_pdf_upload_bytes
from modules.logger_tool import initialise_logger
from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code
from routers.exam.schemas import (
@ -164,11 +165,7 @@ async def _upload_template_source_file(
institute_id: str,
upload: UploadFile,
) -> str:
file_bytes = await upload.read()
if not file_bytes:
raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
if upload.content_type and upload.content_type != "application/pdf":
raise HTTPException(status_code=400, detail="Uploaded file must be a PDF")
file_bytes = await read_pdf_upload_bytes(upload)
service = SupabaseServiceRoleClient()
storage = StorageAdmin()

View File

@ -26,6 +26,7 @@ from fastapi.responses import JSONResponse
from modules.auth.supabase_bearer import SupabaseBearer
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.upload_validation import read_upload_bytes
from modules.logger_tool import initialise_logger
router = APIRouter()
@ -59,10 +60,9 @@ async def upload_single_file(
if not user_id:
raise HTTPException(status_code=401, detail="User ID required")
# Read file content
file_bytes = await file.read()
# Validate MIME/type and read file content with a hard size limit.
file_bytes, mime_type = await read_upload_bytes(file)
file_size = len(file_bytes)
mime_type = file.content_type or 'application/octet-stream'
filename = file.filename or path
logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}")
@ -234,10 +234,9 @@ async def upload_directory(
# Process each file
for i, (file, relative_path) in enumerate(zip(files, relative_paths)):
try:
# Read file content
file_bytes = await file.read()
# Validate MIME/type and read file content with a hard size limit.
file_bytes, mime_type = await read_upload_bytes(file)
file_size = len(file_bytes)
mime_type = file.content_type or 'application/octet-stream'
filename = file.filename or f"file_{i}"
total_size += file_size
@ -291,6 +290,8 @@ async def upload_directory(
logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}")
except HTTPException:
raise
except Exception as e:
logger.error(f"Failed to upload file {relative_path}: {e}")
# Continue with other files, don't fail entire upload

View File

@ -0,0 +1,54 @@
import asyncio
import pytest
from fastapi import HTTPException
from modules.upload_validation import MAX_UPLOAD_BYTES, read_pdf_upload_bytes, read_upload_bytes
class FakeUpload:
def __init__(self, data: bytes, content_type: str, filename: str = "file.bin"):
self._data = data
self._pos = 0
self.content_type = content_type
self.filename = filename
async def read(self, size: int = -1) -> bytes:
if self._pos >= len(self._data):
return b""
if size is None or size < 0:
size = len(self._data) - self._pos
chunk = self._data[self._pos : self._pos + size]
self._pos += len(chunk)
return chunk
def run(coro):
return asyncio.run(coro)
def test_valid_pdf_upload_passes_and_returns_mime():
data, mime = run(read_upload_bytes(FakeUpload(b"%PDF-1.7\n", "application/pdf")))
assert data.startswith(b"%PDF-")
assert mime == "application/pdf"
def test_disallowed_mime_rejected_with_415():
with pytest.raises(HTTPException) as exc:
run(read_upload_bytes(FakeUpload(b"print(1)", "application/x-python")))
assert exc.value.status_code == 415
assert "Unsupported upload type" in exc.value.detail
def test_oversize_upload_rejected_with_413():
with pytest.raises(HTTPException) as exc:
run(read_upload_bytes(FakeUpload(b"x" * (MAX_UPLOAD_BYTES + 1), "text/plain")))
assert exc.value.status_code == 413
assert "exceeds max size" in exc.value.detail
def test_pdf_helper_rejects_spoofed_pdf_mime():
with pytest.raises(HTTPException) as exc:
run(read_pdf_upload_bytes(FakeUpload(b"not a pdf", "application/pdf")))
assert exc.value.status_code == 415
assert "not a valid PDF" in exc.value.detail