hg-agents/backend/modules/document/router.py


								import os

								import uuid

								import shutil

								from datetime import datetime

								from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File

								from sqlalchemy import select

								from sqlalchemy.ext.asyncio import AsyncSession

								from database import get_db

								from models import AuditLog

								from schemas import DocumentUploadOut, DocumentParseResult

								from config import settings

								from dependencies import get_current_user


								router = APIRouter(prefix="/api/document", tags=["document"])


								@router.post("/upload", response_model=DocumentUploadOut)
								async def upload_document(
								    file: UploadFile = File(...),
								    request: Request = None,
								    user: dict = Depends(get_current_user),
								):
								    """Save an uploaded file under ``settings.UPLOAD_DIR`` and describe it.

								    The file is stored as ``<generated uuid><extension of the client
								    filename>``; the generated id is returned so the stored file can be
								    located by that prefix later.

								    Args:
								        file: The multipart upload.
								        request: Not read by this handler; kept so the signature is unchanged.
								        user: Value produced by the ``get_current_user`` dependency; not
								            read by this handler.

								    Returns:
								        A ``DocumentUploadOut`` carrying the new id, the client-supplied
								        filename and content type (with fallbacks when absent), the size
								        in bytes and a naive UTC timestamp.

								    Raises:
								        HTTPException: 400 when the upload is larger than
								            ``settings.MAX_UPLOAD_SIZE_MB`` megabytes.
								    """
								    max_size = settings.MAX_UPLOAD_SIZE_MB * 1024 * 1024

								    # Read at most one byte past the limit: that is enough to detect an
								    # oversized upload without loading an arbitrarily large body into
								    # memory. int() keeps the read size valid if the setting is a float.
								    content = await file.read(int(max_size) + 1)
								    if len(content) > max_size:
								        raise HTTPException(400, f"文件大小超过限制 ({settings.MAX_UPLOAD_SIZE_MB}MB)")

								    file_id = uuid.uuid4()
								    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

								    original_name = file.filename or "unknown"
								    # Only the extension of the client-supplied name is reused on disk;
								    # the stem is always the generated UUID.
								    ext = os.path.splitext(original_name)[1]
								    file_path = os.path.join(settings.UPLOAD_DIR, f"{file_id}{ext}")

								    with open(file_path, "wb") as f:
								        f.write(content)

								    return DocumentUploadOut(
								        file_id=file_id,
								        filename=original_name,
								        file_size=len(content),
								        content_type=file.content_type or "application/octet-stream",
								        upload_time=datetime.utcnow(),
								    )


								@router.post("/parse/{file_id}", response_model=DocumentParseResult)
								async def parse_document(
								    file_id: uuid.UUID,
								    request: Request,
								    db: AsyncSession = Depends(get_db),
								    user: dict = Depends(get_current_user),
								):
								    """Return the content of a stored upload and record an audit entry.

								    The file is located by scanning ``settings.UPLOAD_DIR`` for the first
								    name that starts with ``file_id``. Files with a known text extension
								    are read as UTF-8 (undecodable bytes are replaced); PDF, Word and Excel
								    files only produce a placeholder string; every other extension is
								    reported as unsupported.

								    Args:
								        file_id: Identifier that prefixes the stored file name.
								        request: Incoming request; its client host is written to the audit
								            entry when available.
								        db: Session the ``AuditLog`` row is added to and flushed on.
								        user: Mapping whose ``"id"`` value is parsed as a UUID for the
								            audit entry's operator.

								    Returns:
								        A ``DocumentParseResult`` with the id, the stored file name, the
								        extracted or placeholder content, and a metadata dict (file size
								        and extension, plus line/char counts for text files or a ``type``
								        tag otherwise).

								    Raises:
								        HTTPException: 404 when no stored file name starts with ``file_id``.
								    """
								    text_extensions = {".txt", ".md", ".py", ".js", ".ts", ".json", ".xml", ".yaml", ".yml", ".csv", ".html", ".css", ".java", ".go", ".rs"}
								    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

								    prefix = str(file_id)
								    found_filename = next(
								        (name for name in os.listdir(settings.UPLOAD_DIR) if name.startswith(prefix)),
								        "",
								    )
								    if not found_filename:
								        raise HTTPException(404, "文件不存在")
								    found_file = os.path.join(settings.UPLOAD_DIR, found_filename)

								    ext = os.path.splitext(found_filename)[1].lower()
								    content = ""
								    metadata = {"file_size": os.path.getsize(found_file), "extension": ext}

								    if ext in text_extensions:
								        with open(found_file, "r", encoding="utf-8", errors="replace") as f:
								            content = f.read()
								        metadata["lines"] = len(content.splitlines())
								        metadata["chars"] = len(content)
								    elif ext == ".pdf":
								        content = f"[PDF文档解析] 文件: {found_filename}"
								        metadata["type"] = "pdf"
								    elif ext in {".doc", ".docx"}:
								        content = f"[Word文档解析] 文件: {found_filename}"
								        metadata["type"] = "word"
								    elif ext in {".xls", ".xlsx"}:
								        content = f"[Excel文档解析] 文件: {found_filename}"
								        metadata["type"] = "excel"
								    else:
								        # splitext() keeps the leading dot in ``ext``; drop it here so the
								        # message shows ".exe" rather than "..exe".
								        content = f"[不支持的文件类型 .{ext[1:]}] 文件: {found_filename}"
								        metadata["type"] = "unsupported"

								    audit = AuditLog(
								        operator_id=uuid.UUID(user["id"]),
								        action="document.parse",
								        resource="document",
								        resource_id=str(file_id),
								        detail={"filename": found_filename, "ext": ext},
								        ip_address=request.client.host if request.client else None,
								    )
								    db.add(audit)
								    # Only flushed here; NOTE(review): the commit presumably happens in
								    # the get_db dependency — confirm.
								    await db.flush()

								    return DocumentParseResult(
								        file_id=file_id,
								        filename=found_filename,
								        content=content,
								        metadata=metadata,
								    )


								@router.delete("/{file_id}")
								async def delete_document(
								    file_id: uuid.UUID,
								    request: Request,
								    db: AsyncSession = Depends(get_db),
								    user: dict = Depends(get_current_user),
								):
								    """Remove the stored upload for ``file_id`` and record an audit entry.

								    The first entry of ``settings.UPLOAD_DIR`` whose name starts with the
								    id is deleted. A 404 ``HTTPException`` is raised when no entry matches.
								    On success an ``AuditLog`` row is added to ``db`` and flushed, and a
								    small status dict is returned.
								    """
								    upload_dir = settings.UPLOAD_DIR
								    os.makedirs(upload_dir, exist_ok=True)

								    id_prefix = str(file_id)
								    target_name = next(
								        (entry for entry in os.listdir(upload_dir) if entry.startswith(id_prefix)),
								        None,
								    )
								    if target_name is None:
								        raise HTTPException(404, "文件不存在")

								    os.remove(os.path.join(upload_dir, target_name))

								    log_entry = AuditLog(
								        operator_id=uuid.UUID(user["id"]),
								        action="document.delete",
								        resource="document",
								        resource_id=id_prefix,
								        ip_address=request.client.host if request.client else None,
								    )
								    db.add(log_entry)
								    await db.flush()

								    return {"code": 200, "message": "已删除"}


								@router.post("/format")
								async def format_document(
								    payload: dict,
								    request: Request,
								    db: AsyncSession = Depends(get_db),
								    user: dict = Depends(get_current_user),
								):
								    """Reformat the text in the request body and record an audit entry.

								    Args:
								        payload: Request body. ``"content"`` (default ``""``) is the text
								            to format and ``"format_type"`` (default ``"standard"``)
								            selects the formatting applied by ``_apply_formatting``.
								        request: Incoming request; its client host is written to the audit
								            entry when available.
								        db: Session the ``AuditLog`` row is added to and flushed on.
								        user: Mapping whose ``"id"`` value is parsed as a UUID for the
								            audit entry's operator.

								    Returns:
								        A dict with ``code`` 200 and a ``data`` object holding the
								        formatted text and the format type that was used.

								    Raises:
								        HTTPException: 400 when ``content`` or ``format_type`` is present
								            but is not a string.
								    """
								    content = payload.get("content", "")
								    format_type = payload.get("format_type", "standard")

								    # The body is an untyped dict, so a null or non-string value would
								    # otherwise fail inside the formatter as an unhandled error.
								    if not isinstance(content, str) or not isinstance(format_type, str):
								        raise HTTPException(400, "content 和 format_type 必须为字符串")

								    result = _apply_formatting(content, format_type)

								    audit = AuditLog(
								        operator_id=uuid.UUID(user["id"]),
								        action="document.format",
								        resource="document",
								        resource_id=format_type,
								        detail={"format_type": format_type, "original_length": len(content)},
								        ip_address=request.client.host if request.client else None,
								    )
								    db.add(audit)
								    await db.flush()

								    return {"code": 200, "data": {"formatted": result, "format_type": format_type}}


								def _apply_formatting(content: str, format_type: str) -> str:
								    """Reformat ``content`` according to ``format_type``.

								    * ``"standard"``: strip every line, drop empty ones, and separate the
								      rest with a blank line.
								    * ``"markdown"``: emit a title/timestamp header, then the stripped
								      non-empty lines; short lines (< 60 chars) ending in sentence
								      punctuation become block quotes unless they start with ``#``.
								    * ``"json"``: pretty-print valid JSON; otherwise wrap the raw text and
								      its line count in a JSON object.
								    * anything else: return ``content`` unchanged.
								    """
								    raw_lines = content.splitlines()

								    def non_blank():
								        # Yield each line stripped of surrounding whitespace, skipping blanks.
								        for raw in raw_lines:
								            text = raw.strip()
								            if text:
								                yield text

								    if format_type == "standard":
								        return "\n\n".join(non_blank())

								    if format_type == "markdown":
								        sentence_endings = (".", "。", "?", "？", "!", "！")
								        pieces = [f"# 格式化文档\n\n> 处理时间: {datetime.utcnow().isoformat()}\n"]
								        for text in non_blank():
								            is_heading = text.startswith("#")
								            is_short_sentence = len(text) < 60 and text.endswith(sentence_endings)
								            if is_short_sentence and not is_heading:
								                pieces.append(f"> {text}\n")
								            else:
								                pieces.append(text)
								        return "\n\n".join(pieces)

								    if format_type == "json":
								        import json

								        try:
								            return json.dumps(json.loads(content), ensure_ascii=False, indent=2)
								        except json.JSONDecodeError:
								            fallback = {"content": content, "lines": len(raw_lines)}
								            return json.dumps(fallback, ensure_ascii=False, indent=2)

								    return content