"""Document endpoints: upload, parse, delete and text formatting."""

import json
import os
import shutil
import uuid
from datetime import datetime

from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from database import get_db
from models import AuditLog
from schemas import DocumentUploadOut, DocumentParseResult
from config import settings
from dependencies import get_current_user

router = APIRouter(prefix="/api/document", tags=["document"])

# Extensions whose files are read back verbatim as UTF-8 text by the parser.
_TEXT_EXTENSIONS = frozenset({
    ".txt", ".md", ".py", ".js", ".ts", ".json", ".xml", ".yaml", ".yml",
    ".csv", ".html", ".css", ".java", ".go", ".rs",
})


def _find_stored_file(file_id: uuid.UUID) -> "tuple[str, str] | None":
    """Locate the stored upload whose name starts with ``file_id``.

    Uploads are stored as ``<file_id><ext>``, so the first directory entry
    with that prefix is taken as the match.

    Returns:
        ``(full_path, stored_filename)``, or ``None`` when nothing matches.
    """
    # Create the directory first so listdir cannot fail on a fresh deployment.
    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
    prefix = str(file_id)
    for fname in os.listdir(settings.UPLOAD_DIR):
        if fname.startswith(prefix):
            return os.path.join(settings.UPLOAD_DIR, fname), fname
    return None


@router.post("/upload", response_model=DocumentUploadOut)
async def upload_document(
    file: UploadFile = File(...),
    request: Request = None,
    user: dict = Depends(get_current_user),
):
    """Store an uploaded file under a fresh UUID and return its descriptor.

    The file is written to ``settings.UPLOAD_DIR`` as ``<uuid><ext>``, where
    ``ext`` is the extension of the client-supplied filename (if any).

    Raises:
        HTTPException: 400 when the upload exceeds
            ``settings.MAX_UPLOAD_SIZE_MB``.
    """
    max_size = settings.MAX_UPLOAD_SIZE_MB * 1024 * 1024
    # Read at most one byte past the limit: enough to detect an oversized
    # upload without loading an arbitrarily large body into memory.
    content = await file.read(int(max_size) + 1)
    if len(content) > max_size:
        raise HTTPException(400, f"文件大小超过限制 ({settings.MAX_UPLOAD_SIZE_MB}MB)")

    file_id = uuid.uuid4()
    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
    original_name = file.filename or "unknown"
    # Only the extension of the client filename is reused; the stored name is
    # otherwise server-generated.
    ext = os.path.splitext(original_name)[1]
    stored_name = f"{file_id}{ext}"
    file_path = os.path.join(settings.UPLOAD_DIR, stored_name)
    with open(file_path, "wb") as f:
        f.write(content)

    return DocumentUploadOut(
        file_id=file_id,
        filename=original_name,
        file_size=len(content),
        content_type=file.content_type or "application/octet-stream",
        upload_time=datetime.utcnow(),
    )


@router.post("/parse/{file_id}", response_model=DocumentParseResult)
async def parse_document(
    file_id: uuid.UUID,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Return the content and basic metadata of a previously uploaded file.

    Text-like files are read as UTF-8 (undecodable bytes are replaced) and
    get ``lines``/``chars`` metadata. PDF, Word and Excel files yield only a
    placeholder string; any other extension is reported as unsupported.
    An audit log entry is added for every successful call.

    Raises:
        HTTPException: 404 when no stored file matches ``file_id``.
    """
    located = _find_stored_file(file_id)
    if located is None:
        raise HTTPException(404, "文件不存在")
    found_file, found_filename = located

    ext = os.path.splitext(found_filename)[1].lower()
    metadata = {"file_size": os.path.getsize(found_file), "extension": ext}

    if ext in _TEXT_EXTENSIONS:
        with open(found_file, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()
        metadata["lines"] = len(content.splitlines())
        metadata["chars"] = len(content)
    elif ext == ".pdf":
        content = f"[PDF文档解析] 文件: {found_filename}"
        metadata["type"] = "pdf"
    elif ext in {".doc", ".docx"}:
        content = f"[Word文档解析] 文件: {found_filename}"
        metadata["type"] = "word"
    elif ext in {".xls", ".xlsx"}:
        content = f"[Excel文档解析] 文件: {found_filename}"
        metadata["type"] = "excel"
    else:
        # ``ext`` already carries its leading dot, so none is added here.
        content = f"[不支持的文件类型 {ext}] 文件: {found_filename}"
        metadata["type"] = "unsupported"

    audit = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.parse",
        resource="document",
        resource_id=str(file_id),
        detail={"filename": found_filename, "ext": ext},
        ip_address=request.client.host if request.client else None,
    )
    db.add(audit)
    await db.flush()

    return DocumentParseResult(
        file_id=file_id,
        filename=found_filename,
        content=content,
        metadata=metadata,
    )


@router.delete("/{file_id}")
async def delete_document(
    file_id: uuid.UUID,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Delete the stored file matching ``file_id`` and add an audit entry.

    Raises:
        HTTPException: 404 when no stored file matches ``file_id``, including
            when it disappears between lookup and removal.
    """
    located = _find_stored_file(file_id)
    if located is None:
        raise HTTPException(404, "文件不存在")

    try:
        os.remove(located[0])
    except FileNotFoundError:
        # Removed by a concurrent request after the directory scan: report
        # it as missing rather than failing with a server error.
        raise HTTPException(404, "文件不存在") from None

    audit = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.delete",
        resource="document",
        resource_id=str(file_id),
        ip_address=request.client.host if request.client else None,
    )
    db.add(audit)
    await db.flush()

    return {"code": 200, "message": "已删除"}


@router.post("/format")
async def format_document(
    payload: dict,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Format the text in ``payload["content"]`` and add an audit entry.

    ``payload["format_type"]`` selects the formatter and defaults to
    ``"standard"``; see ``_apply_formatting`` for the recognised values.

    Raises:
        HTTPException: 400 when ``content`` is present but not a string.
    """
    content = payload.get("content", "")
    format_type = payload.get("format_type", "standard")
    # The payload is an unvalidated dict, so ``content`` may be null, a
    # number, etc.; reject those instead of crashing in the formatter.
    if not isinstance(content, str):
        raise HTTPException(400, "content 必须是字符串")

    result = _apply_formatting(content, format_type)

    audit = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.format",
        resource="document",
        resource_id=format_type,
        detail={"format_type": format_type, "original_length": len(content)},
        ip_address=request.client.host if request.client else None,
    )
    db.add(audit)
    await db.flush()

    return {"code": 200, "data": {"formatted": result, "format_type": format_type}}


def _apply_formatting(content: str, format_type: str) -> str:
    """Return ``content`` reformatted according to ``format_type``.

    * ``"standard"``: strip each line, drop blank lines, and join the rest
      with blank lines between them.
    * ``"markdown"``: prepend a title and timestamp header; lines starting
      with ``#`` are kept as-is, short lines (< 60 chars) ending in sentence
      punctuation become block quotes, other lines pass through.
    * ``"json"``: pretty-print valid JSON; otherwise wrap the raw text and
      its line count in a JSON object.
    * anything else: return ``content`` unchanged.
    """
    lines = content.splitlines()

    if format_type == "standard":
        stripped = (line.strip() for line in lines)
        return "\n\n".join(line for line in stripped if line)

    if format_type == "markdown":
        result = [f"# 格式化文档\n\n> 处理时间: {datetime.utcnow().isoformat()}\n"]
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if line.startswith("#"):
                result.append(line)
            elif len(line) < 60 and line.endswith((".", "。", "?", "?", "!", "!")):
                result.append(f"> {line}\n")
            else:
                result.append(line)
        return "\n\n".join(result)

    if format_type == "json":
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            # Not valid JSON: fall back to wrapping the raw text.
            return json.dumps(
                {"content": content, "lines": len(lines)},
                ensure_ascii=False,
                indent=2,
            )
        return json.dumps(parsed, ensure_ascii=False, indent=2)

    return content