You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
274 lines
8.5 KiB
274 lines
8.5 KiB
"""文档处理模块路由。
|
|
|
|
提供文档上传、解析、格式修正和删除功能。
|
|
支持多种文档格式(文本、PDF、Word、Excel 等)的处理。
|
|
"""
|
|
import os
|
|
import uuid
|
|
import shutil
|
|
from datetime import datetime
|
|
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from database import get_db
|
|
from models import AuditLog
|
|
from schemas import DocumentUploadOut, DocumentParseResult
|
|
from config import settings
|
|
from dependencies import get_current_user
|
|
|
|
# Router for the document endpoints: every route registered on it is served
# under the "/api/document" prefix and tagged "document".
router = APIRouter(prefix="/api/document", tags=["document"])
|
|
|
|
|
|
@router.post("/upload", response_model=DocumentUploadOut)
async def upload_document(
    file: UploadFile = File(...),
    request: Request = None,
    user: dict = Depends(get_current_user),
):
    """Save an uploaded file into the upload directory.

    The upload is rejected when it is larger than
    ``settings.MAX_UPLOAD_SIZE_MB`` megabytes. Accepted files are written to
    ``settings.UPLOAD_DIR`` under a freshly generated UUID followed by the
    extension of the original filename.

    Args:
        file: The uploaded file.
        request: The incoming HTTP request (not referenced in the body).
        user: Result of the ``get_current_user`` dependency (not referenced
            in the body; declared so the dependency runs for this route).

    Returns:
        DocumentUploadOut: File ID, original filename, size in bytes,
        content type and upload time.

    Raises:
        HTTPException: 400 when the file exceeds the configured size limit.
    """
    max_size = settings.MAX_UPLOAD_SIZE_MB * 1024 * 1024  # limit in bytes

    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without loading an arbitrarily large body into memory.
    content = await file.read(int(max_size) + 1)
    if len(content) > max_size:
        raise HTTPException(400, f"文件大小超过限制 ({settings.MAX_UPLOAD_SIZE_MB}MB)")

    file_id = uuid.uuid4()
    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

    original_name = file.filename or "unknown"
    # Only the extension of the client-supplied name is reused; the stored
    # name itself is the generated UUID.
    ext = os.path.splitext(original_name)[1]
    stored_name = f"{file_id}{ext}"
    file_path = os.path.join(settings.UPLOAD_DIR, stored_name)

    with open(file_path, "wb") as f:
        f.write(content)

    return DocumentUploadOut(
        file_id=file_id,
        filename=original_name,
        file_size=len(content),
        content_type=file.content_type or "application/octet-stream",
        upload_time=datetime.utcnow(),
    )
|
|
|
|
|
|
@router.post("/parse/{file_id}", response_model=DocumentParseResult)
async def parse_document(
    file_id: uuid.UUID,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Return the content of a previously uploaded file.

    The stored file is located by looking for a name in
    ``settings.UPLOAD_DIR`` that starts with ``file_id``. Files with a known
    text extension are read as UTF-8 (undecodable bytes are replaced). PDF,
    Word and Excel files are NOT actually parsed: a placeholder string is
    returned for them. Any other extension is reported as unsupported.
    An audit log entry is added for every successful call.

    Args:
        file_id: Identifier of the uploaded file.
        request: The incoming HTTP request; its client host is recorded in
            the audit log when available.
        db: Async database session used to store the audit log entry.
        user: Current user; ``user["id"]`` is recorded as the operator.

    Returns:
        DocumentParseResult: File ID, stored filename, extracted content (or
        placeholder) and metadata.

    Raises:
        HTTPException: 404 when no stored file matches ``file_id``.
    """
    # Extensions that are read directly as plain text.
    text_extensions = {
        ".txt", ".md", ".py", ".js", ".ts", ".json", ".xml", ".yaml",
        ".yml", ".csv", ".html", ".css", ".java", ".go", ".rs",
    }
    # Formats for which only a placeholder is produced:
    # extension -> (label used in the placeholder text, metadata type).
    placeholder_kinds = {
        ".pdf": ("PDF", "pdf"),
        ".doc": ("Word", "word"),
        ".docx": ("Word", "word"),
        ".xls": ("Excel", "excel"),
        ".xlsx": ("Excel", "excel"),
    }

    # Make sure the directory exists so listing it cannot fail on a fresh
    # deployment.
    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

    prefix = str(file_id)
    found_filename = next(
        (name for name in os.listdir(settings.UPLOAD_DIR) if name.startswith(prefix)),
        None,
    )
    if found_filename is None:
        raise HTTPException(404, "文件不存在")
    found_file = os.path.join(settings.UPLOAD_DIR, found_filename)

    ext = os.path.splitext(found_filename)[1].lower()
    metadata = {"file_size": os.path.getsize(found_file), "extension": ext}

    if ext in text_extensions:
        with open(found_file, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()
        metadata["lines"] = len(content.splitlines())
        metadata["chars"] = len(content)
    elif ext in placeholder_kinds:
        label, kind = placeholder_kinds[ext]
        content = f"[{label}文档解析] 文件: {found_filename}"
        metadata["type"] = kind
    else:
        # ``ext`` already carries its leading dot, so it is inserted as-is
        # (the previous ".{ext}" rendered e.g. "..bin").
        content = f"[不支持的文件类型 {ext}] 文件: {found_filename}"
        metadata["type"] = "unsupported"

    # Audit log entry. Only flush() is called here.
    # NOTE(review): the commit presumably happens in the get_db dependency - confirm.
    audit = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.parse",
        resource="document",
        resource_id=str(file_id),
        detail={"filename": found_filename, "ext": ext},
        ip_address=request.client.host if request.client else None,
    )
    db.add(audit)
    await db.flush()

    return DocumentParseResult(
        file_id=file_id,
        filename=found_filename,
        content=content,
        metadata=metadata,
    )
|
|
|
|
|
|
@router.delete("/{file_id}")
async def delete_document(
    file_id: uuid.UUID,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Remove a previously uploaded file from the upload directory.

    The first entry of ``settings.UPLOAD_DIR`` whose name starts with
    ``file_id`` is deleted, and an audit log entry is added afterwards.

    Args:
        file_id: Identifier of the uploaded file.
        request: The incoming HTTP request; its client host is recorded in
            the audit log when available.
        db: Async database session used to store the audit log entry.
        user: Current user; ``user["id"]`` is recorded as the operator.

    Returns:
        dict: ``{"code": 200, "message": ...}`` on success.

    Raises:
        HTTPException: 404 when no stored file matches ``file_id``.
    """
    upload_dir = settings.UPLOAD_DIR
    os.makedirs(upload_dir, exist_ok=True)

    wanted_prefix = str(file_id)
    target = next(
        (entry for entry in os.listdir(upload_dir) if entry.startswith(wanted_prefix)),
        None,
    )
    if target is None:
        raise HTTPException(404, "文件不存在")
    os.remove(os.path.join(upload_dir, target))

    # Record the deletion in the audit log.
    client = request.client
    db.add(
        AuditLog(
            operator_id=uuid.UUID(user["id"]),
            action="document.delete",
            resource="document",
            resource_id=wanted_prefix,
            ip_address=client.host if client else None,
        )
    )
    await db.flush()

    return {"code": 200, "message": "已删除"}
|
|
|
|
|
|
@router.post("/format")
async def format_document(
    payload: dict,
    request: Request,
    db: AsyncSession = Depends(get_db),
    user: dict = Depends(get_current_user),
):
    """Reformat the text supplied in the request body.

    The body is a JSON object with a ``content`` field (default ``""``) and a
    ``format_type`` field (default ``"standard"``). The formatting itself is
    delegated to ``_apply_formatting``; an audit log entry is added for every
    successful call.

    Args:
        payload: Request body containing ``content`` and ``format_type``.
        request: The incoming HTTP request; its client host is recorded in
            the audit log when available.
        db: Async database session used to store the audit log entry.
        user: Current user; ``user["id"]`` is recorded as the operator.

    Returns:
        dict: ``{"code": 200, "data": {"formatted": ..., "format_type": ...}}``.

    Raises:
        HTTPException: 400 when ``content`` is not a string.
    """
    content = payload.get("content", "")
    format_type = payload.get("format_type", "standard")

    # The payload is an untyped dict, so ``content`` may be null, a number,
    # a list, ... Reject it here instead of failing later with an unhandled
    # error when string methods are called on it.
    if not isinstance(content, str):
        raise HTTPException(400, "content 必须是字符串")

    result = _apply_formatting(content, format_type)

    # Record the operation in the audit log.
    audit = AuditLog(
        operator_id=uuid.UUID(user["id"]),
        action="document.format",
        resource="document",
        resource_id=format_type,
        detail={"format_type": format_type, "original_length": len(content)},
        ip_address=request.client.host if request.client else None,
    )
    db.add(audit)
    await db.flush()

    return {"code": 200, "data": {"formatted": result, "format_type": format_type}}
|
|
|
|
|
|
def _apply_formatting(content: str, format_type: str) -> str:
|
|
"""应用指定的格式规则对文本内容进行格式化。
|
|
|
|
Args:
|
|
content: 待格式化的原始文本内容。
|
|
format_type: 格式类型,支持 standard、markdown、json。
|
|
|
|
Returns:
|
|
str: 格式化后的文本内容。
|
|
"""
|
|
lines = content.splitlines()
|
|
result = []
|
|
|
|
if format_type == "standard":
|
|
for line in lines:
|
|
line = line.strip()
|
|
if line:
|
|
result.append(line)
|
|
return "\n\n".join(result)
|
|
|
|
elif format_type == "markdown":
|
|
result.append(f"# 格式化文档\n\n> 处理时间: {datetime.utcnow().isoformat()}\n")
|
|
for line in lines:
|
|
line = line.strip()
|
|
if line:
|
|
if line.startswith("#"):
|
|
result.append(line)
|
|
elif len(line) < 60 and line.endswith((".", "。", "?", "?", "!", "!")):
|
|
result.append(f"> {line}\n")
|
|
else:
|
|
result.append(line)
|
|
return "\n\n".join(result)
|
|
|
|
elif format_type == "json":
|
|
import json
|
|
try:
|
|
parsed = json.loads(content)
|
|
return json.dumps(parsed, ensure_ascii=False, indent=2)
|
|
except json.JSONDecodeError:
|
|
return json.dumps({"content": content, "lines": len(lines)}, ensure_ascii=False, indent=2)
|
|
|
|
return content
|
|
|