You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

199 lines
6.2 KiB

import os
import uuid
import shutil
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from database import get_db
from models import AuditLog
from schemas import DocumentUploadOut, DocumentParseResult
from config import settings
from dependencies import get_current_user
# Router for the document endpoints; every route registered on it is
# served under the "/api/document" prefix and tagged "document".
router = APIRouter(prefix="/api/document", tags=["document"])
@router.post("/upload", response_model=DocumentUploadOut)
async def upload_document(
    file: UploadFile = File(...),
    request: Request = None,
    user: dict = Depends(get_current_user),
):
    """Save an uploaded file into ``settings.UPLOAD_DIR`` and describe it.

    The file is stored as ``<uuid4><extension of the original filename>``;
    the generated UUID is returned as ``file_id``.

    Args:
        file: The multipart upload.
        request: Not used by this handler; kept so the signature is unchanged.
        user: Result of the ``get_current_user`` dependency. Not read here
            (presumably it is what enforces authentication — confirm).

    Returns:
        A ``DocumentUploadOut`` with the new file id, the client-supplied
        filename and content type (with fallbacks), the stored size in bytes
        and the upload time (naive UTC).

    Raises:
        HTTPException: 400 when the upload exceeds
            ``settings.MAX_UPLOAD_SIZE_MB``.
    """
    max_size = settings.MAX_UPLOAD_SIZE_MB * 1024 * 1024

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory first.
    content = await file.read(max_size + 1)
    if len(content) > max_size:
        raise HTTPException(400, f"文件大小超过限制 ({settings.MAX_UPLOAD_SIZE_MB}MB)")

    original_name = file.filename or "unknown"
    file_id = uuid.uuid4()

    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
    # Only the extension of the client-supplied name is reused on disk; the
    # rest of the stored name is the server-generated UUID.
    ext = os.path.splitext(original_name)[1]
    file_path = os.path.join(settings.UPLOAD_DIR, f"{file_id}{ext}")
    with open(file_path, "wb") as f:
        f.write(content)

    return DocumentUploadOut(
        file_id=file_id,
        filename=original_name,
        file_size=len(content),
        content_type=file.content_type or "application/octet-stream",
        upload_time=datetime.utcnow(),
    )
@router.post("/parse/{file_id}", response_model=DocumentParseResult)
async def parse_document(
    file_id: uuid.UUID,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Return the content and basic metadata of a previously stored file.

    The file is located by scanning ``settings.UPLOAD_DIR`` for the first
    entry whose name starts with ``file_id``. Plain-text extensions are read
    as UTF-8 (undecodable bytes are replaced). PDF, Word and Excel files are
    NOT actually parsed: those branches only return a placeholder label.
    Any other extension is reported as unsupported. An ``AuditLog`` row with
    action ``document.parse`` is added to the session.

    Args:
        file_id: Identifier whose string form prefixes the stored filename.
        request: Used only to record the client IP in the audit entry.
        db: Async session the audit entry is added to and flushed on.
        user: Current user; ``user["id"]`` must be parseable by ``uuid.UUID``.

    Returns:
        A ``DocumentParseResult`` with the stored filename, the extracted or
        placeholder content, and a metadata dict.

    Raises:
        HTTPException: 404 when no stored file matches ``file_id``.
    """
    text_extensions = {
        ".txt", ".md", ".py", ".js", ".ts", ".json", ".xml", ".yaml",
        ".yml", ".csv", ".html", ".css", ".java", ".go", ".rs",
    }

    # Creating the directory first keeps os.listdir from failing when
    # nothing has been uploaded yet.
    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
    prefix = str(file_id)
    found_filename = next(
        (name for name in os.listdir(settings.UPLOAD_DIR) if name.startswith(prefix)),
        None,
    )
    if found_filename is None:
        raise HTTPException(404, "文件不存在")
    found_file = os.path.join(settings.UPLOAD_DIR, found_filename)

    # splitext keeps the leading dot, e.g. ".pdf" (or "" when there is none).
    ext = os.path.splitext(found_filename)[1].lower()
    metadata = {"file_size": os.path.getsize(found_file), "extension": ext}

    if ext in text_extensions:
        with open(found_file, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()
        metadata["lines"] = len(content.splitlines())
        metadata["chars"] = len(content)
    elif ext == ".pdf":
        content = f"[PDF文档解析] 文件: {found_filename}"
        metadata["type"] = "pdf"
    elif ext in {".doc", ".docx"}:
        content = f"[Word文档解析] 文件: {found_filename}"
        metadata["type"] = "word"
    elif ext in {".xls", ".xlsx"}:
        content = f"[Excel文档解析] 文件: {found_filename}"
        metadata["type"] = "excel"
    else:
        # ext already starts with "."; adding another one rendered "..xyz".
        content = f"[不支持的文件类型 {ext}] 文件: {found_filename}"
        metadata["type"] = "unsupported"

    audit = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.parse",
        resource="document",
        resource_id=str(file_id),
        detail={"filename": found_filename, "ext": ext},
        ip_address=request.client.host if request.client else None,
    )
    db.add(audit)
    # Only flushed here; the commit is left to whatever owns the session
    # (presumably the get_db dependency — confirm).
    await db.flush()

    return DocumentParseResult(
        file_id=file_id,
        filename=found_filename,
        content=content,
        metadata=metadata,
    )
@router.delete("/{file_id}")
async def delete_document(
    file_id: uuid.UUID,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Remove the stored file for ``file_id`` and record an audit entry.

    The first entry in ``settings.UPLOAD_DIR`` whose name starts with the
    string form of ``file_id`` is deleted; at most one file is removed.

    Args:
        file_id: Identifier whose string form prefixes the stored filename.
        request: Used only to record the client IP in the audit entry.
        db: Async session the audit entry is added to and flushed on.
        user: Current user; ``user["id"]`` must be parseable by ``uuid.UUID``.

    Returns:
        ``{"code": 200, "message": "已删除"}`` on success.

    Raises:
        HTTPException: 404 when no stored file matches ``file_id``.
    """
    upload_dir = settings.UPLOAD_DIR
    os.makedirs(upload_dir, exist_ok=True)

    wanted_prefix = str(file_id)
    target = next(
        (entry for entry in os.listdir(upload_dir) if entry.startswith(wanted_prefix)),
        None,
    )
    if target is None:
        raise HTTPException(404, "文件不存在")
    os.remove(os.path.join(upload_dir, target))

    entry = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.delete",
        resource="document",
        resource_id=wanted_prefix,
        ip_address=request.client.host if request.client else None,
    )
    db.add(entry)
    await db.flush()

    return {"code": 200, "message": "已删除"}
@router.post("/format")
async def format_document(
    payload: dict,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Reformat the submitted text and record an audit entry.

    Args:
        payload: JSON object. ``content`` (default ``""``) is the text to
            format; ``format_type`` (default ``"standard"``) selects the
            formatter applied by ``_apply_formatting``.
        request: Used only to record the client IP in the audit entry.
        db: Async session the audit entry is added to and flushed on.
        user: Current user; ``user["id"]`` must be parseable by ``uuid.UUID``.

    Returns:
        ``{"code": 200, "data": {"formatted": ..., "format_type": ...}}``.

    Raises:
        HTTPException: 422 when ``content`` or ``format_type`` is present
            but is not a string.
    """
    content = payload.get("content", "")
    format_type = payload.get("format_type", "standard")

    # The body is an untyped dict, so a client can send null, a number or a
    # list. Reject those here: a non-string content would otherwise crash in
    # _apply_formatting (no .splitlines()) and surface as a 500.
    if not isinstance(content, str):
        raise HTTPException(422, "content 必须是字符串")
    # format_type is also stored as the audit resource_id, which the other
    # handlers always fill with a string.
    if not isinstance(format_type, str):
        raise HTTPException(422, "format_type 必须是字符串")

    result = _apply_formatting(content, format_type)

    audit = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.format",
        resource="document",
        resource_id=format_type,
        detail={"format_type": format_type, "original_length": len(content)},
        ip_address=request.client.host if request.client else None,
    )
    db.add(audit)
    await db.flush()

    return {"code": 200, "data": {"formatted": result, "format_type": format_type}}
def _apply_formatting(content: str, format_type: str) -> str:
lines = content.splitlines()
result = []
if format_type == "standard":
for line in lines:
line = line.strip()
if line:
result.append(line)
return "\n\n".join(result)
elif format_type == "markdown":
result.append(f"# 格式化文档\n\n> 处理时间: {datetime.utcnow().isoformat()}\n")
for line in lines:
line = line.strip()
if line:
if line.startswith("#"):
result.append(line)
elif len(line) < 60 and line.endswith((".", "", "?", "", "!", "")):
result.append(f"> {line}\n")
else:
result.append(line)
return "\n\n".join(result)
elif format_type == "json":
import json
try:
parsed = json.loads(content)
return json.dumps(parsed, ensure_ascii=False, indent=2)
except json.JSONDecodeError:
return json.dumps({"content": content, "lines": len(lines)}, ensure_ascii=False, indent=2)
return content