"""Extract plain text from documents and apply simple formatting rules.

Third-party parsers (PyPDF2, python-docx, openpyxl) are imported lazily so
the module still loads when they are not installed.  Parsing problems are
reported through the returned string rather than raised.
"""

import logging
import os

logger = logging.getLogger(__name__)

# Failed optional imports: dependency key -> user-facing message.  A key is
# only ever added, so a failed import is not retried within the process.
_IMPORT_ERRORS: dict[str, str] = {}

# File extension (lower-case, with dot) -> canonical file type for "auto".
_EXTENSION_TYPES: dict[str, str] = {
    ".pdf": "pdf",
    ".docx": "word",
    ".doc": "word",
    ".xlsx": "excel",
    ".xls": "excel",
    ".pptx": "ppt",
    ".ppt": "ppt",
}

# Explicit ``file_type`` spellings accepted as synonyms of a canonical type.
_FILE_TYPE_ALIASES: dict[str, str] = {
    "docx": "word",
    "doc": "word",
    "xlsx": "excel",
    "xls": "excel",
    "pptx": "ppt",
}


def _try_import_pdf() -> bool:
    """Return True if PyPDF2 is importable; record the failure otherwise."""
    if "pdf" in _IMPORT_ERRORS:
        return False
    try:
        from PyPDF2 import PdfReader  # noqa: F401
    except ImportError:
        _IMPORT_ERRORS["pdf"] = "PyPDF2 未安装,无法解析 PDF"
        return False
    return True


def _try_import_docx() -> bool:
    """Return True if python-docx is importable; record the failure otherwise."""
    if "docx" in _IMPORT_ERRORS:
        return False
    try:
        from docx import Document  # noqa: F401
    except ImportError:
        _IMPORT_ERRORS["docx"] = "python-docx 未安装,无法解析 Word 文档"
        return False
    return True


def _try_import_excel() -> bool:
    """Return True if openpyxl is importable; record the failure otherwise."""
    if "excel" in _IMPORT_ERRORS:
        return False
    try:
        import openpyxl  # noqa: F401
    except ImportError:
        _IMPORT_ERRORS["excel"] = "openpyxl 未安装,无法解析 Excel 文档"
        return False
    return True


def _parse_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, one page per line group."""
    if not _try_import_pdf():
        return _IMPORT_ERRORS["pdf"]
    from PyPDF2 import PdfReader

    try:
        reader = PdfReader(file_path)
        # Pages whose extraction yields nothing are skipped.
        page_texts = [
            text for page in reader.pages if (text := page.extract_text())
        ]
        if not page_texts:
            return "(PDF 无可提取的文本内容)"
        return "\n".join(page_texts)
    except Exception as exc:
        logger.error("PDF 解析失败: %s", exc)
        return f"PDF 解析失败: {exc}"


def _parse_word(file_path: str) -> str:
    """Return paragraph text of a Word document followed by its table rows."""
    if not _try_import_docx():
        return _IMPORT_ERRORS["docx"]
    from docx import Document

    try:
        doc = Document(file_path)
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        table_rows = [
            " | ".join(cell.text for cell in row.cells)
            for table in doc.tables
            for row in table.rows
        ]
        result = "\n".join(paragraphs)
        if table_rows:
            result += "\n\n--- 表格内容 ---\n" + "\n".join(table_rows)
        return result or "(Word 文档无可提取的文本内容)"
    except Exception as exc:
        logger.error("Word 解析失败: %s", exc)
        return f"Word 解析失败: {exc}"


def _parse_excel(file_path: str) -> str:
    """Return every non-empty row of every worksheet, grouped by sheet."""
    if not _try_import_excel():
        return _IMPORT_ERRORS["excel"]
    import openpyxl

    try:
        workbook = openpyxl.load_workbook(file_path, data_only=True)
        lines = []
        # Sheet headers are always emitted, so emptiness has to be tracked
        # separately; otherwise the "no content" placeholder is unreachable.
        has_data = False
        for sheet_name in workbook.sheetnames:
            lines.append(f"=== 工作表: {sheet_name} ===")
            for row in workbook[sheet_name].iter_rows(values_only=True):
                row_text = " | ".join(
                    "" if cell is None else str(cell) for cell in row
                )
                # A row made only of separators and spaces carries no data.
                if row_text.strip(" |"):
                    lines.append(row_text)
                    has_data = True
        if not has_data:
            return "(Excel 无可提取的表格内容)"
        return "\n".join(lines)
    except Exception as exc:
        logger.error("Excel 解析失败: %s", exc)
        return f"Excel 解析失败: {exc}"


def _read_text(file_path: str) -> str:
    """Read a file as text, trying UTF-8 first and GBK second."""
    try:
        # "utf-8-sig" decodes plain UTF-8 as well and drops a leading BOM, so
        # the marker does not leak into the returned text.
        with open(file_path, "r", encoding="utf-8-sig") as handle:
            return handle.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, "r", encoding="gbk") as handle:
                return handle.read()
        except Exception as exc:
            logger.error("文本解码失败(已尝试 UTF-8 和 GBK): %s", exc)
            return f"无法以文本方式读取文件: {file_path}"
    except FileNotFoundError:
        return f"文件不存在: {file_path}"
    except Exception as exc:
        logger.error("文档读取失败: %s", exc)
        return f"文档读取失败: {exc}"


def parse_document(file_path: str, file_type: str = "auto") -> str:
    """Extract the textual content of a document.

    Args:
        file_path: Path of the file to read.
        file_type: ``"auto"`` to choose by file extension, or one of
            ``"pdf"``, ``"word"``, ``"excel"``, ``"ppt"``.  The extension-like
            spellings ``"docx"``, ``"doc"``, ``"xlsx"``, ``"xls"`` and
            ``"pptx"`` are accepted as synonyms.  Any other value is read as
            plain text.

    Returns:
        The extracted text.  Failures (missing optional dependency, parse
        error, unreadable or missing file, unsupported PPT) are reported as a
        human-readable message in the returned string instead of an exception.
    """
    if file_type == "auto":
        extension = os.path.splitext(file_path)[1].lower()
        file_type = _EXTENSION_TYPES.get(extension, "text")
    else:
        file_type = _FILE_TYPE_ALIASES.get(file_type, file_type)

    if file_type == "pdf":
        return _parse_pdf(file_path)
    if file_type == "word":
        return _parse_word(file_path)
    if file_type == "excel":
        return _parse_excel(file_path)
    if file_type == "ppt":
        return "PPT 解析暂不支持,请将内容复制到 Word 或 PDF 后重试。"
    return _read_text(file_path)


def format_correction(content: str, format_rules: str = "standard") -> str:
    """Normalise *content* according to a named rule set.

    For ``"standard"`` and ``"enterprise"`` every line is stripped of
    surrounding whitespace and blank lines are removed.  ``"enterprise"``
    additionally places two fixed metadata lines directly after the header.

    Args:
        content: Text to normalise.
        format_rules: Name of the rule set.

    Returns:
        A header line naming the rule set, followed by the formatted lines.
        NOTE(review): for any other rule name only the header is returned and
        *content* is dropped; confirm whether that is intended.
    """
    parts = [f"[格式规则: {format_rules}]\n"]
    if format_rules not in ("standard", "enterprise"):
        return "\n".join(parts)

    if format_rules == "enterprise":
        parts += ["[发文机关] 企业AI平台", "[密级] 内部"]
    parts += [
        stripped for line in content.split("\n") if (stripped := line.strip())
    ]
    return "\n".join(parts)


__all__ = ["parse_document", "format_correction"]