[verified] add upload size and MIME guards
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
(cherry picked from commit f5e05376f637f55b73e474cac8199529682ca398)
This commit is contained in:
parent
e98fed661f
commit
c69451fba2
99
modules/upload_validation.py
Normal file
99
modules/upload_validation.py
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
"""Upload boundary validation shared by file-upload endpoints.
|
||||||
|
|
||||||
|
E3 hardening: keep user-facing upload routes from buffering arbitrary data and
|
||||||
|
from accepting arbitrary MIME/types into Supabase storage.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
|
from fastapi import HTTPException, UploadFile
|
||||||
|
|
||||||
|
# Conservative defaults: Classroom Copilot uploads are user documents/images.
|
||||||
|
# Exam scan uploads already have their own 50 MB PDF-only guard in routers.exam.batches.
|
||||||
|
MAX_UPLOAD_BYTES = int(os.getenv("CC_UPLOAD_MAX_BYTES", str(25 * 1024 * 1024)))
|
||||||
|
UPLOAD_CHUNK_BYTES = 1024 * 1024
|
||||||
|
|
||||||
|
ALLOWED_UPLOAD_MIME_TYPES = frozenset(
|
||||||
|
mt.strip().lower()
|
||||||
|
for mt in os.getenv(
|
||||||
|
"CC_UPLOAD_ALLOWED_MIME_TYPES",
|
||||||
|
",".join(
|
||||||
|
[
|
||||||
|
"application/pdf",
|
||||||
|
"image/png",
|
||||||
|
"image/jpeg",
|
||||||
|
"image/webp",
|
||||||
|
"image/gif",
|
||||||
|
"text/plain",
|
||||||
|
"text/csv",
|
||||||
|
"text/markdown",
|
||||||
|
"application/msword",
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
"application/vnd.ms-powerpoint",
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
|
"application/vnd.ms-excel",
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
]
|
||||||
|
),
|
||||||
|
).split(",")
|
||||||
|
if mt.strip()
|
||||||
|
)
|
||||||
|
|
||||||
|
_PDF_MIME_TYPES = {"application/pdf", "application/x-pdf"}
|
||||||
|
|
||||||
|
|
||||||
|
def allowed_upload_mime_types_csv() -> str:
|
||||||
|
"""Stable display string for evidence/errors without leaking config internals."""
|
||||||
|
return ", ".join(sorted(ALLOWED_UPLOAD_MIME_TYPES))
|
||||||
|
|
||||||
|
|
||||||
|
def _declared_mime(upload: UploadFile) -> str:
|
||||||
|
return (upload.content_type or "application/octet-stream").split(";", 1)[0].strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
def validate_upload_mime(upload: UploadFile, *, allowed_mime_types: Optional[Iterable[str]] = None) -> str:
|
||||||
|
"""Validate client-declared upload MIME/type and return its normalised value."""
|
||||||
|
declared = _declared_mime(upload)
|
||||||
|
allowed = {mt.lower() for mt in (allowed_mime_types or ALLOWED_UPLOAD_MIME_TYPES)}
|
||||||
|
if declared not in allowed:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=415,
|
||||||
|
detail=(
|
||||||
|
f"Unsupported upload type '{declared}'. Allowed MIME types: "
|
||||||
|
f"{', '.join(sorted(allowed))}"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return declared
|
||||||
|
|
||||||
|
|
||||||
|
async def read_upload_bytes(
|
||||||
|
upload: UploadFile,
|
||||||
|
*,
|
||||||
|
max_bytes: int = MAX_UPLOAD_BYTES,
|
||||||
|
allowed_mime_types: Optional[Iterable[str]] = None,
|
||||||
|
) -> tuple[bytes, str]:
|
||||||
|
"""Validate MIME and read an UploadFile with a hard size ceiling."""
|
||||||
|
mime_type = validate_upload_mime(upload, allowed_mime_types=allowed_mime_types)
|
||||||
|
chunks: list[bytes] = []
|
||||||
|
total = 0
|
||||||
|
while True:
|
||||||
|
chunk = await upload.read(UPLOAD_CHUNK_BYTES)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
total += len(chunk)
|
||||||
|
if total > max_bytes:
|
||||||
|
raise HTTPException(status_code=413, detail=f"Upload exceeds max size ({max_bytes} bytes)")
|
||||||
|
chunks.append(chunk)
|
||||||
|
return b"".join(chunks), mime_type
|
||||||
|
|
||||||
|
|
||||||
|
async def read_pdf_upload_bytes(upload: UploadFile, *, max_bytes: int = MAX_UPLOAD_BYTES) -> bytes:
|
||||||
|
"""Read a PDF-only upload with size and lightweight magic-header validation."""
|
||||||
|
data, _mime_type = await read_upload_bytes(upload, max_bytes=max_bytes, allowed_mime_types=_PDF_MIME_TYPES)
|
||||||
|
if not data:
|
||||||
|
raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
|
||||||
|
if not data.startswith(b"%PDF-"):
|
||||||
|
raise HTTPException(status_code=415, detail="Uploaded file is not a valid PDF")
|
||||||
|
return data
|
||||||
@ -12,6 +12,7 @@ from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
|
|||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_upload_bytes
|
||||||
from modules.document_processor import DocumentProcessor
|
from modules.document_processor import DocumentProcessor
|
||||||
from modules.queue_system import (
|
from modules.queue_system import (
|
||||||
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
||||||
@ -88,13 +89,13 @@ async def upload_file(
|
|||||||
# Stage DB row to get file_id
|
# Stage DB row to get file_id
|
||||||
staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
|
staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
|
||||||
name = _safe_filename(path or file.filename)
|
name = _safe_filename(path or file.filename)
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
insert_res = client.supabase.table('files').insert({
|
insert_res = client.supabase.table('files').insert({
|
||||||
'cabinet_id': cabinet_id,
|
'cabinet_id': cabinet_id,
|
||||||
'name': name,
|
'name': name,
|
||||||
'path': staged_path,
|
'path': staged_path,
|
||||||
'bucket': bucket,
|
'bucket': bucket,
|
||||||
'mime_type': file.content_type,
|
'mime_type': mime_type,
|
||||||
'uploaded_by': user_id,
|
'uploaded_by': user_id,
|
||||||
'size_bytes': len(file_bytes),
|
'size_bytes': len(file_bytes),
|
||||||
'source': 'classroomcopilot-web'
|
'source': 'classroomcopilot-web'
|
||||||
@ -107,7 +108,7 @@ async def upload_file(
|
|||||||
# Final storage path: bucket/cabinet_id/file_id/file
|
# Final storage path: bucket/cabinet_id/file_id/file
|
||||||
final_storage_path = f"{cabinet_id}/{file_id}/{name}"
|
final_storage_path = f"{cabinet_id}/{file_id}/{name}"
|
||||||
try:
|
try:
|
||||||
storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
|
storage.upload_file(bucket, final_storage_path, file_bytes, mime_type, upsert=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# cleanup staged row
|
# cleanup staged row
|
||||||
client.supabase.table('files').delete().eq('id', file_id).execute()
|
client.supabase.table('files').delete().eq('id', file_id).execute()
|
||||||
|
|||||||
@ -19,6 +19,7 @@ from fastapi.responses import JSONResponse
|
|||||||
from modules.auth.supabase_bearer import SupabaseBearer
|
from modules.auth.supabase_bearer import SupabaseBearer
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_upload_bytes
|
||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
@ -72,10 +73,9 @@ async def upload_file(
|
|||||||
if not user_id:
|
if not user_id:
|
||||||
raise HTTPException(status_code=401, detail="User ID required")
|
raise HTTPException(status_code=401, detail="User ID required")
|
||||||
|
|
||||||
# Read file content
|
# Validate MIME/type and read file content with a hard size limit.
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
file_size = len(file_bytes)
|
file_size = len(file_bytes)
|
||||||
mime_type = file.content_type or 'application/octet-stream'
|
|
||||||
filename = file.filename or path
|
filename = file.filename or path
|
||||||
|
|
||||||
logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}")
|
logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}")
|
||||||
|
|||||||
@ -28,6 +28,7 @@ from api.services.docling.regions import detect_response_regions_from_pdf
|
|||||||
from modules.database.services.exam_projection import project_template, project_template_safe
|
from modules.database.services.exam_projection import project_template, project_template_safe
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_pdf_upload_bytes
|
||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code
|
from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code
|
||||||
from routers.exam.schemas import (
|
from routers.exam.schemas import (
|
||||||
@ -164,11 +165,7 @@ async def _upload_template_source_file(
|
|||||||
institute_id: str,
|
institute_id: str,
|
||||||
upload: UploadFile,
|
upload: UploadFile,
|
||||||
) -> str:
|
) -> str:
|
||||||
file_bytes = await upload.read()
|
file_bytes = await read_pdf_upload_bytes(upload)
|
||||||
if not file_bytes:
|
|
||||||
raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
|
|
||||||
if upload.content_type and upload.content_type != "application/pdf":
|
|
||||||
raise HTTPException(status_code=400, detail="Uploaded file must be a PDF")
|
|
||||||
|
|
||||||
service = SupabaseServiceRoleClient()
|
service = SupabaseServiceRoleClient()
|
||||||
storage = StorageAdmin()
|
storage = StorageAdmin()
|
||||||
|
|||||||
@ -26,6 +26,7 @@ from fastapi.responses import JSONResponse
|
|||||||
from modules.auth.supabase_bearer import SupabaseBearer
|
from modules.auth.supabase_bearer import SupabaseBearer
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_upload_bytes
|
||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
@ -59,10 +60,9 @@ async def upload_single_file(
|
|||||||
if not user_id:
|
if not user_id:
|
||||||
raise HTTPException(status_code=401, detail="User ID required")
|
raise HTTPException(status_code=401, detail="User ID required")
|
||||||
|
|
||||||
# Read file content
|
# Validate MIME/type and read file content with a hard size limit.
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
file_size = len(file_bytes)
|
file_size = len(file_bytes)
|
||||||
mime_type = file.content_type or 'application/octet-stream'
|
|
||||||
filename = file.filename or path
|
filename = file.filename or path
|
||||||
|
|
||||||
logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}")
|
logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}")
|
||||||
@ -234,10 +234,9 @@ async def upload_directory(
|
|||||||
# Process each file
|
# Process each file
|
||||||
for i, (file, relative_path) in enumerate(zip(files, relative_paths)):
|
for i, (file, relative_path) in enumerate(zip(files, relative_paths)):
|
||||||
try:
|
try:
|
||||||
# Read file content
|
# Validate MIME/type and read file content with a hard size limit.
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
file_size = len(file_bytes)
|
file_size = len(file_bytes)
|
||||||
mime_type = file.content_type or 'application/octet-stream'
|
|
||||||
filename = file.filename or f"file_{i}"
|
filename = file.filename or f"file_{i}"
|
||||||
|
|
||||||
total_size += file_size
|
total_size += file_size
|
||||||
@ -291,6 +290,8 @@ async def upload_directory(
|
|||||||
|
|
||||||
logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}")
|
logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}")
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to upload file {relative_path}: {e}")
|
logger.error(f"Failed to upload file {relative_path}: {e}")
|
||||||
# Continue with other files, don't fail entire upload
|
# Continue with other files, don't fail entire upload
|
||||||
|
|||||||
54
tests/test_upload_validation.py
Normal file
54
tests/test_upload_validation.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import asyncio
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi import HTTPException
|
||||||
|
|
||||||
|
from modules.upload_validation import MAX_UPLOAD_BYTES, read_pdf_upload_bytes, read_upload_bytes
|
||||||
|
|
||||||
|
|
||||||
|
class FakeUpload:
|
||||||
|
def __init__(self, data: bytes, content_type: str, filename: str = "file.bin"):
|
||||||
|
self._data = data
|
||||||
|
self._pos = 0
|
||||||
|
self.content_type = content_type
|
||||||
|
self.filename = filename
|
||||||
|
|
||||||
|
async def read(self, size: int = -1) -> bytes:
|
||||||
|
if self._pos >= len(self._data):
|
||||||
|
return b""
|
||||||
|
if size is None or size < 0:
|
||||||
|
size = len(self._data) - self._pos
|
||||||
|
chunk = self._data[self._pos : self._pos + size]
|
||||||
|
self._pos += len(chunk)
|
||||||
|
return chunk
|
||||||
|
|
||||||
|
|
||||||
|
def run(coro):
|
||||||
|
return asyncio.run(coro)
|
||||||
|
|
||||||
|
|
||||||
|
def test_valid_pdf_upload_passes_and_returns_mime():
|
||||||
|
data, mime = run(read_upload_bytes(FakeUpload(b"%PDF-1.7\n", "application/pdf")))
|
||||||
|
assert data.startswith(b"%PDF-")
|
||||||
|
assert mime == "application/pdf"
|
||||||
|
|
||||||
|
|
||||||
|
def test_disallowed_mime_rejected_with_415():
|
||||||
|
with pytest.raises(HTTPException) as exc:
|
||||||
|
run(read_upload_bytes(FakeUpload(b"print(1)", "application/x-python")))
|
||||||
|
assert exc.value.status_code == 415
|
||||||
|
assert "Unsupported upload type" in exc.value.detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_oversize_upload_rejected_with_413():
|
||||||
|
with pytest.raises(HTTPException) as exc:
|
||||||
|
run(read_upload_bytes(FakeUpload(b"x" * (MAX_UPLOAD_BYTES + 1), "text/plain")))
|
||||||
|
assert exc.value.status_code == 413
|
||||||
|
assert "exceeds max size" in exc.value.detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_pdf_helper_rejects_spoofed_pdf_mime():
|
||||||
|
with pytest.raises(HTTPException) as exc:
|
||||||
|
run(read_pdf_upload_bytes(FakeUpload(b"not a pdf", "application/pdf")))
|
||||||
|
assert exc.value.status_code == 415
|
||||||
|
assert "not a valid PDF" in exc.value.detail
|
||||||
Loading…
x
Reference in New Issue
Block a user