hg-agents/backend/modules/document/router.py


								"""文档处理模块路由。


								提供文档上传、解析、格式修正和删除功能。

								支持多种文档格式（文本、PDF、Word、Excel 等）的处理。

								"""

								import os

								import uuid

								import shutil

								from datetime import datetime

								from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File

								from sqlalchemy import select

								from sqlalchemy.ext.asyncio import AsyncSession

								from database import get_db

								from models import AuditLog

								from schemas import DocumentUploadOut, DocumentParseResult

								from config import settings

								from dependencies import get_current_user


								# Router for the document endpoints; every route declared on it is served under /api/document.
								router = APIRouter(prefix="/api/document", tags=["document"])


								@router.post("/upload", response_model=DocumentUploadOut)
								async def upload_document(
								    file: UploadFile = File(...),
								    request: Request = None,
								    user: dict = Depends(get_current_user),
								):
								    """Save an uploaded file into the upload directory under a fresh UUID name.

								    The upload is rejected when it is larger than
								    ``settings.MAX_UPLOAD_SIZE_MB`` megabytes. Only ``limit + 1`` bytes are
								    ever read from the upload, so an oversized file is detected without
								    copying its whole content into this handler's memory.

								    Args:
								        file: The uploaded file object.
								        request: The HTTP request object (not used by the body).
								        user: The current user as resolved by ``get_current_user`` (not used
								            by the body; presumably present to enforce authentication --
								            confirm against ``dependencies``).

								    Returns:
								        DocumentUploadOut: The generated file id, the client-supplied
								        filename, the stored size in bytes, the content type and the
								        upload timestamp (naive UTC).

								    Raises:
								        HTTPException: 400 when the file exceeds the configured size limit.
								    """
								    # Size limit in bytes; int() keeps the value usable as a read size even
								    # if the setting is configured as a fractional number of megabytes.
								    max_size = int(settings.MAX_UPLOAD_SIZE_MB * 1024 * 1024)

								    # Read one byte past the limit: enough to tell "too large" apart from
								    # "exactly at the limit" without buffering an arbitrarily large body.
								    content = await file.read(max_size + 1)
								    if len(content) > max_size:
								        raise HTTPException(400, f"文件大小超过限制 ({settings.MAX_UPLOAD_SIZE_MB}MB)")

								    original_name = file.filename or "unknown"
								    file_id = uuid.uuid4()
								    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

								    # Only the extension of the client filename is kept; the stored name is
								    # the UUID, so the client cannot choose where the file lands.
								    ext = os.path.splitext(original_name)[1]
								    stored_name = f"{file_id}{ext}"
								    file_path = os.path.join(settings.UPLOAD_DIR, stored_name)

								    with open(file_path, "wb") as f:
								        f.write(content)

								    return DocumentUploadOut(
								        file_id=file_id,
								        filename=original_name,
								        file_size=len(content),
								        content_type=file.content_type or "application/octet-stream",
								        upload_time=datetime.utcnow(),
								    )


								@router.post("/parse/{file_id}", response_model=DocumentParseResult)
								async def parse_document(
								    file_id: uuid.UUID,
								    request: Request,
								    db: AsyncSession = Depends(get_db),
								    user: dict = Depends(get_current_user),
								):
								    """Extract the content of a previously uploaded file.

								    The stored file is located by looking for a name in the upload directory
								    that starts with ``file_id``. Text-like extensions are read as UTF-8
								    (undecodable bytes are replaced). PDF, Word and Excel files are not
								    actually parsed here: a placeholder string naming the file is returned.
								    Any other extension is reported as unsupported. An audit log entry is
								    added to the session for every successful call.

								    Args:
								        file_id: Unique id of the uploaded file.
								        request: The HTTP request object; used for the client IP in the audit log.
								        db: Async database session.
								        user: The current user; ``user["id"]`` must be a UUID string.

								    Returns:
								        DocumentParseResult: The stored filename, the extracted (or
								        placeholder) content and a metadata dict with ``file_size`` and
								        ``extension`` plus either ``lines``/``chars`` or ``type``.

								    Raises:
								        HTTPException: 404 when no stored file matches ``file_id``.
								    """
								    text_extensions = {
								        ".txt", ".md", ".py", ".js", ".ts", ".json", ".xml", ".yaml",
								        ".yml", ".csv", ".html", ".css", ".java", ".go", ".rs",
								    }
								    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

								    # Stored names are "<uuid><ext>", so the id is matched as a prefix.
								    id_prefix = str(file_id)
								    found_filename = next(
								        (name for name in os.listdir(settings.UPLOAD_DIR) if name.startswith(id_prefix)),
								        None,
								    )
								    if found_filename is None:
								        raise HTTPException(404, "文件不存在")
								    found_file = os.path.join(settings.UPLOAD_DIR, found_filename)

								    ext = os.path.splitext(found_filename)[1].lower()  # includes the leading dot, or "" if none
								    content = ""
								    metadata = {"file_size": os.path.getsize(found_file), "extension": ext}

								    if ext in text_extensions:
								        with open(found_file, "r", encoding="utf-8", errors="replace") as f:
								            content = f.read()
								        metadata["lines"] = len(content.splitlines())
								        metadata["chars"] = len(content)
								    elif ext == ".pdf":
								        content = f"[PDF文档解析] 文件: {found_filename}"
								        metadata["type"] = "pdf"
								    elif ext in {".doc", ".docx"}:
								        content = f"[Word文档解析] 文件: {found_filename}"
								        metadata["type"] = "word"
								    elif ext in {".xls", ".xlsx"}:
								        content = f"[Excel文档解析] 文件: {found_filename}"
								        metadata["type"] = "excel"
								    else:
								        # ext already carries its own dot; prefixing another one rendered
								        # "..xyz". A bare "." is kept for files without any extension.
								        content = f"[不支持的文件类型 {ext or '.'}] 文件: {found_filename}"
								        metadata["type"] = "unsupported"

								    # Record the operation in the audit log.
								    audit = AuditLog(
								        operator_id=uuid.UUID(user["id"]),
								        action="document.parse",
								        resource="document",
								        resource_id=str(file_id),
								        detail={"filename": found_filename, "ext": ext},
								        ip_address=request.client.host if request.client else None,
								    )
								    db.add(audit)
								    await db.flush()

								    return DocumentParseResult(
								        file_id=file_id,
								        filename=found_filename,
								        content=content,
								        metadata=metadata,
								    )


								@router.delete("/{file_id}")
								async def delete_document(
								    file_id: uuid.UUID,
								    request: Request,
								    db: AsyncSession = Depends(get_db),
								    user: dict = Depends(get_current_user),
								):
								    """Remove a previously uploaded file from the upload directory.

								    The first directory entry whose name starts with ``file_id`` is deleted,
								    and an audit log entry for the deletion is added to the session.

								    Args:
								        file_id: Unique id of the uploaded file.
								        request: The HTTP request object; used for the client IP in the audit log.
								        db: Async database session.
								        user: The current user; ``user["id"]`` must be a UUID string.

								    Returns:
								        dict: ``{"code": 200, "message": ...}`` on success.

								    Raises:
								        HTTPException: 404 when no stored file matches ``file_id``.
								    """
								    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

								    # Stored names begin with the file id, so the first prefix match is the target.
								    id_prefix = str(file_id)
								    target = next(
								        (entry for entry in os.listdir(settings.UPLOAD_DIR) if entry.startswith(id_prefix)),
								        None,
								    )
								    if target is None:
								        raise HTTPException(404, "文件不存在")
								    os.remove(os.path.join(settings.UPLOAD_DIR, target))

								    # Record the operation in the audit log.
								    db.add(
								        AuditLog(
								            operator_id=uuid.UUID(user["id"]),
								            action="document.delete",
								            resource="document",
								            resource_id=str(file_id),
								            ip_address=request.client.host if request.client else None,
								        )
								    )
								    await db.flush()

								    return {"code": 200, "message": "已删除"}


								@router.post("/format")
								async def format_document(
								    payload: dict,
								    request: Request,
								    db: AsyncSession = Depends(get_db),
								    user: dict = Depends(get_current_user),
								):
								    """Reformat document text and record the operation in the audit log.

								    The formatting itself is delegated to ``_apply_formatting``, which
								    handles the ``standard``, ``markdown`` and ``json`` format types.

								    Args:
								        payload: Request body with an optional ``content`` string (default
								            ``""``) and an optional ``format_type`` string (default
								            ``"standard"``).
								        request: The HTTP request object; used for the client IP in the audit log.
								        db: Async database session.
								        user: The current user; ``user["id"]`` must be a UUID string.

								    Returns:
								        dict: ``{"code": 200, "data": {"formatted": ..., "format_type": ...}}``.

								    Raises:
								        HTTPException: 422 when ``content`` or ``format_type`` is present
								            but is not a string.
								    """
								    content = payload.get("content", "")
								    format_type = payload.get("format_type", "standard")

								    # The body is an untyped dict, so a JSON null or number can arrive here;
								    # reject it up front instead of failing later on a string method.
								    if not isinstance(content, str) or not isinstance(format_type, str):
								        raise HTTPException(422, "content 和 format_type 必须是字符串")

								    result = _apply_formatting(content, format_type)

								    # Record the operation in the audit log.
								    audit = AuditLog(
								        operator_id=uuid.UUID(user["id"]),
								        action="document.format",
								        resource="document",
								        resource_id=format_type,
								        detail={"format_type": format_type, "original_length": len(content)},
								        ip_address=request.client.host if request.client else None,
								    )
								    db.add(audit)
								    await db.flush()

								    return {"code": 200, "data": {"formatted": result, "format_type": format_type}}


								def _apply_formatting(content: str, format_type: str) -> str:

								    """应用指定的格式规则对文本内容进行格式化。


								    Args:

								        content: 待格式化的原始文本内容。

								        format_type: 格式类型，支持 standard、markdown、json。


								    Returns:

								        str: 格式化后的文本内容。

								    """

								    lines = content.splitlines()

								    result = []


								    if format_type == "standard":

								        for line in lines:

								            line = line.strip()

								            if line:

								                result.append(line)

								        return "\n\n".join(result)


								    elif format_type == "markdown":

								        result.append(f"# 格式化文档\n\n> 处理时间: {datetime.utcnow().isoformat()}\n")

								        for line in lines:

								            line = line.strip()

								            if line:

								                if line.startswith("#"):

								                    result.append(line)

								                elif len(line) < 60 and line.endswith((".", "。", "?", "？", "!", "！")):

								                    result.append(f"> {line}\n")

								                else:

								                    result.append(line)

								        return "\n\n".join(result)


								    elif format_type == "json":

								        import json

								        try:

								            parsed = json.loads(content)

								            return json.dumps(parsed, ensure_ascii=False, indent=2)

								        except json.JSONDecodeError:

								            return json.dumps({"content": content, "lines": len(lines)}, ensure_ascii=False, indent=2)


								    return content